library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.3
## ✓ tibble 3.0.1 ✓ dplyr 0.8.5
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tree)
## Registered S3 method overwritten by 'tree':
## method from
## print.tree cli
# Print warnings as they occur instead of collecting them at the end.
options(warn = 1)
set.seed(1)
# Load the ISLR Carseats data; "?" entries are read as NA.
carseats.df = read.csv("/Users/shahrdadshadab/env/my-R-project/ISLR/Data/datasets/Carseats.csv",
header=T, stringsAsFactors = F, na.strings = "?")
carseats.df <- tibble(carseats.df)
# find NAs
# NOTE(review): all_vars() keeps only rows where EVERY column is NA; to list
# rows with ANY missing value this should be any_vars() -- confirm intent.
carseats.df %>%
filter_all(all_vars(is.na(.)))
# convert all character columns to factor
# Convert every character column of `df` to a factor.
#
# Levels are the names of `table(.x)`, i.e. the sorted unique non-NA values
# (the same ordering `factor()` picks by default).
#
# @param df A data frame or tibble.
# @return `df` with each character column replaced by a factor.
char.to.fctor <- function(df) {
  # mutate_if() is superseded in dplyr; across(where(...)) is the current
  # idiom. The body is now braced so the multi-line function is unambiguous.
  df %>%
    mutate(across(where(is.character),
                  ~ factor(.x, levels = .x %>% table() %>% names())))
}
carseats.df <- char.to.fctor(carseats.df)
# Add new column High as label for all records
carseats.df$High <- ifelse(carseats.df$Sales <= 8, "No", "Yes")
# convert High column into factor
carseats.df$High <- ordered(carseats.df$High, levels = c("Yes","No"))
str(carseats.df)
## tibble [400 × 12] (S3: tbl_df/tbl/data.frame)
## $ Sales : num [1:400] 9.5 11.22 10.06 7.4 4.15 ...
## $ CompPrice : int [1:400] 138 111 113 117 141 124 115 136 132 132 ...
## $ Income : int [1:400] 73 48 35 100 64 113 105 81 110 113 ...
## $ Advertising: int [1:400] 11 16 10 4 3 13 0 15 0 0 ...
## $ Population : int [1:400] 276 260 269 466 340 501 45 425 108 131 ...
## $ Price : int [1:400] 120 83 80 97 128 72 108 120 124 124 ...
## $ ShelveLoc : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
## $ Age : int [1:400] 42 65 59 55 38 78 71 67 76 76 ...
## $ Education : int [1:400] 17 10 12 14 13 16 15 10 10 17 ...
## $ Urban : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
## $ US : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...
## $ High : Ord.factor w/ 2 levels "Yes"<"No": 1 1 1 2 2 1 2 1 2 2 ...
# Construct a classification tree on the full data set. Sales is excluded
# from the predictors because High was derived from it.
tree.carseats <- tree(High ~ .-Sales, carseats.df)
# summary() reports the predictors actually used, the number of terminal
# nodes, the residual mean deviance and the TRAINING misclassification rate.
summary(tree.carseats)
##
## Classification tree:
## tree(formula = High ~ . - Sales, data = carseats.df)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "US" "Income" "CompPrice"
## [6] "Population" "Advertising" "Age"
## Number of terminal nodes: 27
## Residual mean deviance: 0.4575 = 170.7 / 373
## Misclassification error rate: 0.09 = 36 / 400
plot(tree.carseats)
# pretty=0 prints the full factor level names at qualitative splits
text(tree.carseats, pretty=0)
# printing the tree object lists every node: split rule, n, deviance,
# predicted class and class proportions (* marks terminal nodes)
tree.carseats
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 400 541.500 No ( 0.41000 0.59000 )
## 2) ShelveLoc: Good 85 90.330 Yes ( 0.77647 0.22353 )
## 4) Price < 135 68 49.260 Yes ( 0.88235 0.11765 )
## 8) US: No 17 22.070 Yes ( 0.64706 0.35294 )
## 16) Price < 109 8 0.000 Yes ( 1.00000 0.00000 ) *
## 17) Price > 109 9 11.460 No ( 0.33333 0.66667 ) *
## 9) US: Yes 51 16.880 Yes ( 0.96078 0.03922 ) *
## 5) Price > 135 17 22.070 No ( 0.35294 0.64706 )
## 10) Income < 46 6 0.000 No ( 0.00000 1.00000 ) *
## 11) Income > 46 11 15.160 Yes ( 0.54545 0.45455 ) *
## 3) ShelveLoc: Bad,Medium 315 390.600 No ( 0.31111 0.68889 )
## 6) Price < 92.5 46 56.530 Yes ( 0.69565 0.30435 )
## 12) Income < 57 10 12.220 No ( 0.30000 0.70000 )
## 24) CompPrice < 110.5 5 0.000 No ( 0.00000 1.00000 ) *
## 25) CompPrice > 110.5 5 6.730 Yes ( 0.60000 0.40000 ) *
## 13) Income > 57 36 35.470 Yes ( 0.80556 0.19444 )
## 26) Population < 207.5 16 21.170 Yes ( 0.62500 0.37500 ) *
## 27) Population > 207.5 20 7.941 Yes ( 0.95000 0.05000 ) *
## 7) Price > 92.5 269 299.800 No ( 0.24535 0.75465 )
## 14) Advertising < 13.5 224 213.200 No ( 0.18304 0.81696 )
## 28) CompPrice < 124.5 96 44.890 No ( 0.06250 0.93750 )
## 56) Price < 106.5 38 33.150 No ( 0.15789 0.84211 )
## 112) Population < 177 12 16.300 No ( 0.41667 0.58333 )
## 224) Income < 60.5 6 0.000 No ( 0.00000 1.00000 ) *
## 225) Income > 60.5 6 5.407 Yes ( 0.83333 0.16667 ) *
## 113) Population > 177 26 8.477 No ( 0.03846 0.96154 ) *
## 57) Price > 106.5 58 0.000 No ( 0.00000 1.00000 ) *
## 29) CompPrice > 124.5 128 150.200 No ( 0.27344 0.72656 )
## 58) Price < 122.5 51 70.680 Yes ( 0.50980 0.49020 )
## 116) ShelveLoc: Bad 11 6.702 No ( 0.09091 0.90909 ) *
## 117) ShelveLoc: Medium 40 52.930 Yes ( 0.62500 0.37500 )
## 234) Price < 109.5 16 7.481 Yes ( 0.93750 0.06250 ) *
## 235) Price > 109.5 24 32.600 No ( 0.41667 0.58333 )
## 470) Age < 49.5 13 16.050 Yes ( 0.69231 0.30769 ) *
## 471) Age > 49.5 11 6.702 No ( 0.09091 0.90909 ) *
## 59) Price > 122.5 77 55.540 No ( 0.11688 0.88312 )
## 118) CompPrice < 147.5 58 17.400 No ( 0.03448 0.96552 ) *
## 119) CompPrice > 147.5 19 25.010 No ( 0.36842 0.63158 )
## 238) Price < 147 12 16.300 Yes ( 0.58333 0.41667 )
## 476) CompPrice < 152.5 7 5.742 Yes ( 0.85714 0.14286 ) *
## 477) CompPrice > 152.5 5 5.004 No ( 0.20000 0.80000 ) *
## 239) Price > 147 7 0.000 No ( 0.00000 1.00000 ) *
## 15) Advertising > 13.5 45 61.830 Yes ( 0.55556 0.44444 )
## 30) Age < 54.5 25 25.020 Yes ( 0.80000 0.20000 )
## 60) CompPrice < 130.5 14 18.250 Yes ( 0.64286 0.35714 )
## 120) Income < 100 9 12.370 No ( 0.44444 0.55556 ) *
## 121) Income > 100 5 0.000 Yes ( 1.00000 0.00000 ) *
## 61) CompPrice > 130.5 11 0.000 Yes ( 1.00000 0.00000 ) *
## 31) Age > 54.5 20 22.490 No ( 0.25000 0.75000 )
## 62) CompPrice < 122.5 10 0.000 No ( 0.00000 1.00000 ) *
## 63) CompPrice > 122.5 10 13.860 No ( 0.50000 0.50000 )
## 126) Price < 125 5 0.000 Yes ( 1.00000 0.00000 ) *
## 127) Price > 125 5 0.000 No ( 0.00000 1.00000 ) *
Tree deviance is defined as: \(-2\sum_{m=1}^{|T_{0}|} \sum_{k=1}^{K} n_{m,k} \ln \widehat{p}_{m,k}\), where \(|T_{0}|\) is the number of terminal nodes.
Residual mean deviance is: \(\frac{-2\sum_{m=1}^{|T_{0}|} \sum_{k=1}^{K} n_{m,k} \ln \widehat{p}_{m,k}}{N - |T_{0}|}\)
Where N is Number of observations.
Note that the values printed for each branch are as follows:
ShelveLoc: Good \((split\,criterion)\) 85 \((number\,of\,observations\,in\,the\,branch)\) 90.330 \((deviance)\) Yes \((overall\,prediction\,for\,the\,branch)\) (0.77647 0.22353 ) \((class\,proportions\,of\,observations\,in\,the\,branch)\)
library(tidyverse)
library(tree)
# Print warnings as they occur.
options(warn = 1)
set.seed(1)
# Reload the Carseats data; "?" entries are read as NA.
carseats.df = read.csv("/Users/shahrdadshadab/env/my-R-project/ISLR/Data/datasets/Carseats.csv",
header=T, stringsAsFactors = F, na.strings = "?")
carseats.df <- tibble(carseats.df)
# find NAs (all_vars(): only rows where every column is NA)
carseats.df %>%
filter_all(all_vars(is.na(.)))
# convert all character columns to factor (levels = sorted unique values)
char.to.fctor <- function(df)
df %>%
mutate_if(is.character, ~ factor(.x, levels = (.x %>% table() %>% names())))
carseats.df <- char.to.fctor(carseats.df)
# Add new column High as label for all records ("Yes" when Sales > 8)
carseats.df$High <- ifelse(carseats.df$Sales <= 8, "No", "Yes")
# convert High column into factor (plain factor here; the earlier section
# used ordered())
carseats.df$High <- factor(carseats.df$High, levels = c("Yes","No"))
set.seed(1113)
# 70/30 train/test split by row index
no.of.train <- ceiling(0.7 * nrow(carseats.df))
no.of.test <- nrow(carseats.df) - no.of.train
train.idx <- sample(seq_len(nrow(carseats.df)), no.of.train)
test.idx <- setdiff(seq_len(nrow(carseats.df)), train.idx)
# build classification tree on train data only (Sales excluded again)
tree.carseats <- tree(High ~ .-Sales, carseats.df[train.idx,])
plot(tree.carseats)
text(tree.carseats, pretty = 0)
# do the prediction on test data (type="class" makes R return class predictions):
pred.probs <- predict(tree.carseats,carseats.df[test.idx,])
tree.pred <- predict(tree.carseats, carseats.df[test.idx,], type = "class")
# see the confusion matrix (rows = predicted class, columns = observed High)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
(confusion_table <- table(tree.pred, carseats.df[test.idx,]$High))
##
## tree.pred Yes No
## Yes 33 8
## No 22 57
# Null-classifier accuracy: the proportion of the majority class in the test
# set (always predicting the more frequent label).
nullClassifier <- max(
(confusion_table[1,1] + confusion_table[2,1])/(confusion_table[1,1] + confusion_table[2,1]+ confusion_table[1,2] + confusion_table[2,2] ),
(confusion_table[1,2] + confusion_table[2,2])/(confusion_table[1,1] + confusion_table[2,1]+ confusion_table[1,2] + confusion_table[2,2] ))
# ROC from the predicted probability of the first class column ("Yes")
roc_obj <- roc(unlist(carseats.df[test.idx,]$High), pred.probs[, 1])
## Setting levels: control = Yes, case = No
## Setting direction: controls > cases
# Let's draw some AUC plots
plot(roc_obj, legacy.axes = TRUE)
missclassificationRate <- mean(tree.pred != carseats.df[test.idx,]$High)
# NOTE(review): the rate formulas below index [row = prediction, col = truth]
# and consistently treat "No" (row/col 2) as the positive class -- confirm
# that is the intended convention.
FP_rates <- confusion_table[2,1]/(confusion_table[2,1]+ confusion_table[1,1])
TP_rates <- confusion_table[2,2]/(confusion_table[2,2]+ confusion_table[1,2])
precisions <- confusion_table[2,2] / (confusion_table[2,2] + confusion_table[2,1])
specificities <- 1 - confusion_table[2,1]/(confusion_table[2,1]+ confusion_table[1,1])
# overall accuracy: correct predictions / all predictions
classification.rate <- (confusion_table[2,2] + confusion_table[1,1]) /
(confusion_table[1,1]+ confusion_table[1,2] + confusion_table[2,1]+ confusion_table[2,2])
# overall fraction of wrong predictions:
# print(confusion_table)
sprintf("Null Classifier error rate : %s", 1 - nullClassifier)
## [1] "Null Classifier error rate : 0.458333333333333"
# average missclassification error rate
sprintf("tree classifier : Missclassification error rate : %s", missclassificationRate)
## [1] "tree classifier : Missclassification error rate : 0.25"
# FP rate:
sprintf("tree classifier : FP rate (TypeI error, 1 - specificity) : %s", FP_rates)
## [1] "tree classifier : FP rate (TypeI error, 1 - specificity) : 0.4"
# tree classification
print(glue::glue("Tree classification rate: ", classification.rate))
## Tree classification rate: 0.75
# Null classifier rate
sprintf("Null Classifier rate: %s", nullClassifier)
## [1] "Null Classifier rate: 0.541666666666667"
# TP rate:
sprintf("tree classifier : TP rate (1-TypeII error, power, sensetivity, recall) : %s", TP_rates)
## [1] "tree classifier : TP rate (1-TypeII error, power, sensetivity, recall) : 0.876923076923077"
# precision:
sprintf("tree classifier : precision: %s", precisions)
## [1] "tree classifier : precision: 0.721518987341772"
# specificity 1-FP/N:
sprintf("tree classifier : specificity 1-FP/N: %s", specificities)
## [1] "tree classifier : specificity 1-FP/N: 0.6"
# -----------------------------------------------------------------------------
# ---------- Lets prune the tree to see if we get better results ---------------
# Cost-complexity pruning with 10-fold CV, guided by misclassification error
# rather than deviance (FUN = prune.misclass).
cv.carseats <- cv.tree(object = tree.carseats, FUN = prune.misclass, K = 10)
# Number of terminal nodes of each tree that CV considered
cv.carseats$size
## [1] 23 19 15 13 9 7 5 2 1
# Error rate corresponding to each tree that CV considered
cv.carseats$dev
## [1] 83 82 82 84 83 85 89 98 111
# value of cost complexity parameter alpha that corresponds to each tree considered by CV
cv.carseats$k
## [1] -Inf 0.000000 0.500000 1.000000 2.000000 3.000000 5.000000
## [8] 9.666667 25.000000
# plot the missclassification error rate as the function of size and k
plot(cv.carseats$size, cv.carseats$dev, type = 'b')
plot(cv.carseats$k, cv.carseats$dev, type = 'b')
# seems like 9 is the best size of the tree, let's prune it to get to tree with size 9
# NOTE(review): dev is actually minimized at sizes 19 and 15 (82); best = 9
# trades a slightly higher CV error (83) for a much smaller tree -- confirm
# that this trade-off is intentional.
prune.carseats <- prune.misclass(tree.carseats, best = 9)
# original data frame, shown for reference before inspecting the pruned tree
str(carseats.df)
## tibble [400 × 12] (S3: tbl_df/tbl/data.frame)
## $ Sales : num [1:400] 9.5 11.22 10.06 7.4 4.15 ...
## $ CompPrice : int [1:400] 138 111 113 117 141 124 115 136 132 132 ...
## $ Income : int [1:400] 73 48 35 100 64 113 105 81 110 113 ...
## $ Advertising: int [1:400] 11 16 10 4 3 13 0 15 0 0 ...
## $ Population : int [1:400] 276 260 269 466 340 501 45 425 108 131 ...
## $ Price : int [1:400] 120 83 80 97 128 72 108 120 124 124 ...
## $ ShelveLoc : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
## $ Age : int [1:400] 42 65 59 55 38 78 71 67 76 76 ...
## $ Education : int [1:400] 17 10 12 14 13 16 15 10 10 17 ...
## $ Urban : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
## $ US : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...
## $ High : Factor w/ 2 levels "Yes","No": 1 1 1 2 2 1 2 1 2 2 ...
# pruned tree: 9 terminal nodes. Its TRAINING misclassification rate (0.1393)
# is higher than the full tree's, as expected after pruning.
summary(prune.carseats)
##
## Classification tree:
## snip.tree(tree = tree.carseats, nodes = c(26L, 54L, 49L, 55L,
## 2L, 96L))
## Variables actually used in tree construction:
## [1] "Price" "ShelveLoc" "Age" "Advertising" "CompPrice"
## Number of terminal nodes: 9
## Residual mean deviance: 0.7524 = 203.9 / 271
## Misclassification error rate: 0.1393 = 39 / 280
plot(prune.carseats)
text(prune.carseats, pretty = 0)
# let's run it on test set to see the misclassification rate again:
# do the prediction on test data (type="class" makes R return class predictions):
pred.probs <- predict(prune.carseats,carseats.df[test.idx,])
tree.pred <- predict(prune.carseats, carseats.df[test.idx,], type = "class")
# see the confusion matrix (rows = predicted class, columns = observed High)
library(pROC)
(confusion_table <- table(tree.pred, carseats.df[test.idx,]$High))
##
## tree.pred Yes No
## Yes 33 13
## No 22 52
# Same metric block as for the unpruned tree, recomputed on the pruned
# tree's confusion matrix. Null classifier = majority-class accuracy.
nullClassifier <- max(
(confusion_table[1,1] + confusion_table[2,1])/(confusion_table[1,1] + confusion_table[2,1]+ confusion_table[1,2] + confusion_table[2,2] ),
(confusion_table[1,2] + confusion_table[2,2])/(confusion_table[1,1] + confusion_table[2,1]+ confusion_table[1,2] + confusion_table[2,2] ))
roc_obj <- roc(unlist(carseats.df[test.idx,]$High), pred.probs[, 1])
## Setting levels: control = Yes, case = No
## Setting direction: controls > cases
# Let's draw some AUC plots
plot(roc_obj, legacy.axes = TRUE)
missclassificationRate <- mean(tree.pred != carseats.df[test.idx,]$High)
# NOTE(review): as before, these formulas treat "No" (row/col 2) as the
# positive class -- confirm the intended convention.
FP_rates <- confusion_table[2,1]/(confusion_table[2,1]+ confusion_table[1,1])
TP_rates <- confusion_table[2,2]/(confusion_table[2,2]+ confusion_table[1,2])
precisions <- confusion_table[2,2] / (confusion_table[2,2] + confusion_table[2,1])
specificities <- 1 - confusion_table[2,1]/(confusion_table[2,1]+ confusion_table[1,1])
classification.rate <- (confusion_table[2,2] + confusion_table[1,1]) /
(confusion_table[1,1]+ confusion_table[1,2] + confusion_table[2,1]+ confusion_table[2,2])
# overall fraction of wrong predictions:
# print(confusion_table)
sprintf("Null Classifier error rate : %s", 1 - nullClassifier)
## [1] "Null Classifier error rate : 0.458333333333333"
# average missclassification error rate
sprintf("tree classifier : Missclassification error rate : %s", missclassificationRate)
## [1] "tree classifier : Missclassification error rate : 0.291666666666667"
# FP rate:
sprintf("tree classifier : FP rate (TypeI error, 1 - specificity) : %s", FP_rates)
## [1] "tree classifier : FP rate (TypeI error, 1 - specificity) : 0.4"
# tree classification
print(glue::glue("Tree classification rate: ", classification.rate))
## Tree classification rate: 0.708333333333333
# Null classifier rate
sprintf("Null Classifier: %s", nullClassifier)
## [1] "Null Classifier: 0.541666666666667"
# TP rate:
sprintf("tree classifier : TP rate (1-TypeII error, power, sensetivity, recall) : %s", TP_rates)
## [1] "tree classifier : TP rate (1-TypeII error, power, sensetivity, recall) : 0.8"
# precision:
sprintf("tree classifier : precision: %s", precisions)
## [1] "tree classifier : precision: 0.702702702702703"
# specificity 1-FP/N:
sprintf("tree classifier : specificity 1-FP/N: %s", specificities)
## [1] "tree classifier : specificity 1-FP/N: 0.6"
Remember that for each given value of \(\alpha\) (called k in R) we assign a subtree \(T_{\alpha}\) as below:
Using Gini index:
\(T_{\alpha} = \underset{T \subset T_{0}}{\operatorname{argmin}} \left\{ \sum\limits_{m=1}^{|T|} \sum\limits_{k=1}^{K} \hat{p}_{m,k} \cdot (1 - \hat{p}_{m,k}) + \alpha \cdot |T| \;\middle|\; T \subset T_{0} \right\}\)
or using Cross Entropy:
\(T_{\alpha} = \underset{T \subset T_{0}}{\operatorname{argmin}} \left\{ \sum\limits_{m=1}^{|T|} \sum\limits_{k=1}^{K} -\hat{p}_{m,k} \cdot \log (\hat{p}_{m,k}) + \alpha \cdot |T| \;\middle|\; T \subset T_{0} \right\}\)
And then we use CV to find a few good values of \(\alpha\) and their corresponding \(T_{\alpha}\) based on the misclassification error rate (FUN = prune.misclass).
library(tidyverse)
library(dataPreparation)
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loading required package: progress
## dataPreparation 0.4.3
## Type dataPrepNews() to see new features/changes/bug fixes.
library(tree)
# Load the Boston housing data; "?" entries are read as NA.
boston.df = read.csv(
"/Users/shahrdadshadab/env/my-R-project/ISLR/Data/datasets/BostonHousing.csv",
header=T, stringsAsFactors = F, na.strings = "?")
boston.df <- tibble(boston.df)
# see if there is any NA in any records
na.idx <- which(is.na(boston.df))
# No NA anywhere , let's do usual clean up:
# remove constant variables
constant_cols <- whichAreConstant(boston.df)
## [1] "whichAreConstant: it took me 0.01s to identify 0 constant column(s)"
# remove Variables that are in double (for example col1 == col2)
(double_cols <- whichAreInDouble(boston.df))
## [1] "whichAreInDouble: it took me 0s to identify 0 column(s) to drop."
## integer(0)
# remove Variables that are exact bijections (for example col1 = A, B, B, A and col2 = 1, 2, 2, 1)
(bijections_cols <- whichAreBijection(boston.df))
## [1] "whichAreBijection: it took me 0.14s to identify 0 column(s) to drop."
## integer(0)
# Drop bijective columns if any were found.
# Fixed: if() takes a scalar condition, so use the short-circuit && rather
# than the elementwise & (which only worked because both operands happened
# to be length-1 here).
boston.df <- if (!is.null(bijections_cols) && length(bijections_cols) > 0) boston.df[- bijections_cols] else boston.df
str(boston.df)
## Classes 'data.table' and 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : int 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ b : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
set.seed(1113)
# 70/30 train/test split by row index
no.of.train <- ceiling(0.7 * nrow(boston.df))
no.of.test <- nrow(boston.df) - no.of.train
train.idx <- sample(seq_len(nrow(boston.df)), no.of.train)
test.idx <- setdiff(seq_len(nrow(boston.df)), train.idx)
# regression tree predicting median home value (medv) from all other columns,
# fitted on the training rows only
tree.boston <- tree(medv ~ ., boston.df, subset = train.idx)
plot(tree.boston)
text(tree.boston, pretty = 0)
# print the full node listing: split, n, deviance and fitted value per node
tree.boston
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 355 31270.0 22.66
## 2) rm < 6.8375 295 12020.0 19.70
## 4) lstat < 15 182 4907.0 22.95
## 8) lstat < 9.68 97 3357.0 25.16
## 16) age < 89.45 92 1374.0 24.26
## 32) rm < 6.5445 61 633.1 22.72 *
## 33) rm > 6.5445 31 313.6 27.28 *
## 17) age > 89.45 5 507.2 41.90 *
## 9) lstat > 9.68 85 528.1 20.42 *
## 5) lstat > 15 113 2091.0 14.46
## 10) crim < 5.76921 53 633.1 17.34 *
## 11) crim > 5.76921 60 631.8 11.92 *
## 3) rm > 6.8375 60 3914.0 37.24
## 6) rm < 7.445 39 1087.0 32.64 *
## 7) rm > 7.445 21 475.9 45.77 *
# let's draw the residuals pdf, very close to standard normal
plot(density(summary(tree.boston)$residuals))
# deviance is sum of squared error for the tree
# do the prediction on test data
pred.values <- predict(tree.boston, newdata = boston.df[test.idx,])
obs.values <- boston.df[test.idx,]$medv
# find the R^2 for test data
(R.squared <- round((cor(pred.values, obs.values) ^ 2), 3))
## [1] 0.705
# now plot the observed vs predicted on test data
plot.data <- data.frame("Pred.values" = pred.values, "obs.values" = obs.values,
"DataSet" = "test")
residuals <- obs.values - pred.values
# segments run vertically from (obs, pred) to (obs, pred + residual) = (obs, obs),
# i.e. each segment's length is that point's residual
plot.residuals <- data.frame("x" = obs.values, "y" = pred.values,
"x1" = obs.values, "y2" = pred.values + residuals,
"DataSet" = "test")
ggplot() +
# plot test samples
geom_point(data = plot.data,
aes(x = obs.values, y = pred.values, color = DataSet)) +
# plot residuals
geom_segment(data = plot.residuals, alpha = 0.2,
aes(x = x, y = y, xend = x1, yend = y2, group = DataSet)) +
# plot optimal regressor (perfect-prediction line y = x)
geom_abline(color = "blue", slope = 1)
# plot the residuals
plot(density(residuals))
plot(pred.values, obs.values)
abline(0,1)
# testMSE
mean((obs.values - pred.values)^2)
## [1] 24.43265
#---------------------------------------------------------
# Now let's prune the tree to see if it performs better
# --------------------------------------------------------
# 10-fold CV cost-complexity pruning; for regression trees cv.tree uses
# deviance (RSS) as the default criterion
cv.boston <- cv.tree(tree.boston, K = 10)
plot(cv.boston$size, cv.boston$dev, type='b')
# The deviance is lowest at 7, so let's construct a pruned tree with 7 nodes
prune.boston <- prune.tree(tree = tree.boston, best=7)
# summary of pruned tree
summary(prune.boston)
##
## Regression tree:
## snip.tree(tree = tree.boston, nodes = 16L)
## Variables actually used in tree construction:
## [1] "rm" "lstat" "age" "crim"
## Number of terminal nodes: 7
## Residual mean deviance: 15.05 = 5237 / 348
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -14.9000 -2.2550 -0.1554 0.0000 2.2500 17.3600
plot(prune.boston)
text(prune.boston, pretty = 0)
# let's draw the residuals pdf for pruned tree
plot(density(summary(prune.boston)$residuals))
# do the prediction on test data
pred.values <- predict(prune.boston, newdata = boston.df[test.idx,])
obs.values <- boston.df[test.idx,]$medv
# find the R^2 for test data
(R.squared <- round((cor(pred.values, obs.values) ^ 2), 3))
## [1] 0.694
# now plot the observed vs predicted on test data
plot.data <- data.frame("Pred.values" = pred.values, "obs.values" = obs.values,
"DataSet" = "test")
residuals <- obs.values - pred.values
plot.residuals <- data.frame("x" = obs.values, "y" = pred.values,
"x1" = obs.values, "y2" = pred.values + residuals,
"DataSet" = "test")
ggplot() +
# plot test samples
geom_point(data = plot.data,
aes(x = obs.values, y = pred.values, color = DataSet)) +
# plot residuals
geom_segment(data = plot.residuals, alpha = 0.2,
aes(x = x, y = y, xend = x1, yend = y2, group = DataSet)) +
# plot optimal regressor (perfect-prediction line y = x)
geom_abline(color = "blue", slope = 1)
# plot the residuals
plot(density(residuals))
plot(pred.values, obs.values)
abline(0,1)
# testMSE
mean((obs.values - pred.values)^2)
## [1] 25.29452
# seemed like pruning downgraded the performance of the model
# So pruning the tree does not add to performance
# From another perspective, since testMSE ~ 25 we can infer that predictions
# are off by about sqrt(25) = 5 (i.e. $5k) from the real home value in a suburb
library(tidyverse)
library(dataPreparation)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
# Load the Boston housing data again for the bagging/random-forest section;
# "?" entries are read as NA.
boston.df = read.csv(
"/Users/shahrdadshadab/env/my-R-project/ISLR/Data/datasets/BostonHousing.csv",
header=T, stringsAsFactors = F, na.strings = "?")
boston.df <- tibble(boston.df)
# see if there is any NA in any records
na.idx <- which(is.na(boston.df))
# No NA anywhere , let's do usual clean up:
# remove constant variables
constant_cols <- whichAreConstant(boston.df)
## [1] "whichAreConstant: it took me 0s to identify 0 constant column(s)"
# remove Variables that are in double (for example col1 == col2)
(double_cols <- whichAreInDouble(boston.df))
## [1] "whichAreInDouble: it took me 0s to identify 0 column(s) to drop."
## integer(0)
# remove Variables that are exact bijections (for example col1 = A, B, B, A and col2 = 1, 2, 2, 1)
(bijections_cols <- whichAreBijection(boston.df))
## [1] "whichAreBijection: it took me 0.05s to identify 0 column(s) to drop."
## integer(0)
# Drop bijective columns if any were found.
# Fixed: if() takes a scalar condition, so use the short-circuit && rather
# than the elementwise &.
boston.df <- if (!is.null(bijections_cols) && length(bijections_cols) > 0) boston.df[- bijections_cols] else boston.df
set.seed(1113)
# 70/30 train/test split by row index
no.of.train <- ceiling(0.7 * nrow(boston.df))
no.of.test <- nrow(boston.df) - no.of.train
train.idx <- sample(seq_len(nrow(boston.df)), no.of.train)
test.idx <- setdiff(seq_len(nrow(boston.df)), train.idx)
# ---------------------------- Bagging ---------------------------
# Bagging = random forest with mtry equal to ALL predictors (mtry is the
# number of predictors considered as split candidates at each split).
bag.boston <- randomForest(medv ~ ., data = boston.df,subset = train.idx,
mtry=ncol(boston.df)-1, importance=T)
# importance of each predictor
# % IncMSE : Increase in out of bag MSE When given predictor excluded from model
# IncNodePurity : Decrease on training RSS (regression) or deviance (classification)
# in each split
importance(bag.boston)
## %IncMSE IncNodePurity
## crim 12.701304 1103.39712
## zn 2.039922 25.44568
## indus 7.278869 157.72731
## chas 2.124328 49.42790
## nox 22.919938 611.15533
## rm 64.312074 16619.62955
## age 8.083396 329.63566
## dis 26.378734 2362.84953
## rad 3.319187 91.67949
## tax 19.010440 556.55256
## ptratio 15.070773 356.08397
## b 4.879778 246.14704
## lstat 31.459875 8339.57390
# plot importance predictors
varImpPlot(bag.boston)
# do the prediction on test data
pred.values <- predict(bag.boston, newdata = boston.df[test.idx,])
obs.values <- boston.df[test.idx,]$medv
# find the R^2 for test data (Bagging)
(R.squared <- round((cor(pred.values, obs.values) ^ 2), 3))
## [1] 0.813
# now plot the observed vs predicted on test data
plot.data <- data.frame("pred.values" = pred.values, "obs.values" = obs.values,
"DataSet" = "test")
residuals <- obs.values - pred.values
plot.residuals <- data.frame("x" = obs.values, "y" = pred.values,
"x1" = obs.values, "y2" = pred.values + residuals,
"DataSet" = "test")
ggplot() +
# plot test samples
geom_point(data = plot.data,
aes(x = obs.values, y = pred.values, color = DataSet)) +
# plot residuals
geom_segment(data = plot.residuals, alpha = 0.2,
aes(x = x, y = y, xend = x1, yend = y2, group = DataSet)) +
# plot optimal regressor (perfect-prediction line y = x)
geom_abline(color = "blue", slope = 1)
# plot the residuals
plot(density(residuals))
plot(pred.values, obs.values)
abline(0,1)
# testMSE (Bagging)
mean((obs.values - pred.values)^2)
## [1] 14.26849
# ---------------------------- Bagging with 25 trees ---------------------------
# change the number of trees in bagging or random forest via the ntree parameter
# Bagging (mtry is the number of predictors considered at each split)
bag.boston.25 <- randomForest(medv ~ ., data = boston.df,subset = train.idx,
mtry=ncol(boston.df)-1, importance=T, ntree = 25)
bag.boston.25
##
## Call:
## randomForest(formula = medv ~ ., data = boston.df, mtry = ncol(boston.df) - 1, importance = T, ntree = 25, subset = train.idx)
## Type of random forest: regression
## Number of trees: 25
## No. of variables tried at each split: 13
##
## Mean of squared residuals: 13.99809
## % Var explained: 84.11
# importance of each predictor
# % IncMSE : Increase in out of bag MSE When given predictor excluded from model
# IncNodePurity : Decrease on training RSS (regression) or deviance (classification)
# in each split
importance(bag.boston.25)
## %IncMSE IncNodePurity
## crim 2.82644654 869.24283
## zn -0.18898953 20.60689
## indus 2.05009318 125.61061
## chas 0.67556199 73.26221
## nox 5.76321724 564.08186
## rm 17.00079669 17959.80899
## age 3.85258734 245.62522
## dis 8.05348940 2158.97998
## rad -0.05974703 78.91323
## tax 7.62804930 615.26243
## ptratio 5.06408048 408.50769
## b 2.62026016 234.92618
## lstat 7.95259625 6772.36098
# plot importance predictors
varImpPlot(bag.boston.25)
# do the prediction on test data
pred.values <- predict(bag.boston.25, newdata = boston.df[test.idx,])
obs.values <- boston.df[test.idx,]$medv
# find the R^2 for test data (Bagging with 25 trees)
(R.squared <- round((cor(pred.values, obs.values) ^ 2), 3))
## [1] 0.771
# now plot the observed vs predicted on test data
plot.data <- data.frame("pred.values" = pred.values, "obs.values" = obs.values,
"DataSet" = "test")
residuals <- obs.values - pred.values
plot.residuals <- data.frame("x" = obs.values, "y" = pred.values,
"x1" = obs.values, "y2" = pred.values + residuals,
"DataSet" = "test")
ggplot() +
# plot test samples
geom_point(data = plot.data,
aes(x = obs.values, y = pred.values, color = DataSet)) +
# plot residuals
geom_segment(data = plot.residuals, alpha = 0.2,
aes(x = x, y = y, xend = x1, yend = y2, group = DataSet)) +
# plot optimal regressor (perfect-prediction line y = x)
geom_abline(color = "blue", slope = 1)
# plot the residuals
plot(density(residuals))
plot(pred.values, obs.values)
abline(0,1)
# testMSE (Bagging with 25 trees)
mean((obs.values - pred.values)^2)
## [1] 17.85693
# ------------------------ Random forest ---------------------------------
# By Default :
# mtry = sqrt(Number of features) For classification
# mtry = one third of number of features For Regression
# Here mtry = 6 predictors are sampled as split candidates at each split.
rf.boston <- randomForest(medv ~ ., data = boston.df,subset = train.idx,
mtry=6, importance=T)
rf.boston
##
## Call:
## randomForest(formula = medv ~ ., data = boston.df, mtry = 6, importance = T, subset = train.idx)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 6
##
## Mean of squared residuals: 11.30472
## % Var explained: 87.17
# importance of each predictor
# % IncMSE : Increase in out of bag MSE When given predictor excluded from model
# IncNodePurity : Decrease on training RSS (regression) or deviance (classification)
# in each split
importance(rf.boston)
## %IncMSE IncNodePurity
## crim 13.457600 1584.57747
## zn 3.783388 115.53530
## indus 9.057431 1598.85331
## chas 1.357703 66.68147
## nox 14.026089 1467.71364
## rm 35.616616 10690.72056
## age 9.136849 674.94388
## dis 17.149022 2240.78477
## rad 4.067559 153.45064
## tax 12.664201 805.21803
## ptratio 11.523643 1595.11319
## b 7.572004 403.50368
## lstat 29.831143 9378.69053
# plot importance predictors
# across all of the trees considered in the random forest, rm and lstat are
# by far the most important features
varImpPlot(rf.boston)
# do the prediction on test data
pred.values <- predict(rf.boston, newdata = boston.df[test.idx,])
obs.values <- boston.df[test.idx,]$medv
# find the R^2 for test data (Random Forest with 6 predictors)
(R.squared <- round((cor(pred.values, obs.values) ^ 2), 3))
## [1] 0.872
# now plot the observed vs predicted on test data
plot.data <- data.frame("pred.values" = pred.values, "obs.values" = obs.values,
"DataSet" = "test")
residuals <- obs.values - pred.values
plot.residuals <- data.frame("x" = obs.values, "y" = pred.values,
"x1" = obs.values, "y2" = pred.values + residuals,
"DataSet" = "test")
ggplot() +
# plot test samples
geom_point(data = plot.data,
aes(x = obs.values, y = pred.values, color = DataSet)) +
# plot residuals
geom_segment(data = plot.residuals, alpha = 0.2,
aes(x = x, y = y, xend = x1, yend = y2, group = DataSet)) +
# plot optimal regressor (perfect-prediction line y = x)
geom_abline(color = "blue", slope = 1)
# plot the residuals
plot(density(residuals))
plot(pred.values, obs.values)
abline(0,1)
# testMSE (Random Forest with 6 predictors)
mean((obs.values - pred.values)^2)
## [1] 10.01629
library(tidyverse)
library(dataPreparation)
library(gbm)
## Loaded gbm 2.1.8
# Load the Boston housing data again for the boosting section; "?" entries
# are read as NA.
boston.df = read.csv(
"/Users/shahrdadshadab/env/my-R-project/ISLR/Data/datasets/BostonHousing.csv",
header=T, stringsAsFactors = F, na.strings = "?")
boston.df <- tibble(boston.df)
# see if there is any NA in any records
na.idx <- which(is.na(boston.df))
# No NA anywhere , let's do usual clean up:
# remove constant variables
constant_cols <- whichAreConstant(boston.df)
## [1] "whichAreConstant: it took me 0s to identify 0 constant column(s)"
# remove Variables that are in double (for example col1 == col2)
(double_cols <- whichAreInDouble(boston.df))
## [1] "whichAreInDouble: it took me 0s to identify 0 column(s) to drop."
## integer(0)
# remove Variables that are exact bijections (for example col1 = A, B, B, A and col2 = 1, 2, 2, 1)
(bijections_cols <- whichAreBijection(boston.df))
## [1] "whichAreBijection: it took me 0.06s to identify 0 column(s) to drop."
## integer(0)
# BUG FIX: the result used to be assigned to `caravan.df` (copy-paste from a
# different dataset's script) and then never used -- every subsequent step
# reads `boston.df`, so a non-empty bijections_cols would silently have been
# ignored. Assign back to boston.df, and use the scalar && in the if().
boston.df <- if (!is.null(bijections_cols) && length(bijections_cols) > 0) boston.df[- bijections_cols] else boston.df
set.seed(1113)
# 70/30 train/test split by row index
no.of.train <- ceiling(0.7 * nrow(boston.df))
no.of.test <- nrow(boston.df) - no.of.train
train.idx <- sample(seq_len(nrow(boston.df)), no.of.train)
test.idx <- setdiff(seq_len(nrow(boston.df)), train.idx)
# 5000 trees each with depth of 4, default value for shrinkage parameter is 0.001
# lambda (Algorithm 8.2 page 322)
boost.boston <- gbm(medv ~ ., data = boston.df[train.idx, ],
distribution = "gaussian", n.trees = 5000,
interaction.depth = 4)
# rm and lstat are most important features
summary(boost.boston)
# partial dependence plots:
# marginal effect of selected feature on response after integrating out other features
plot(boost.boston, i="rm")
# median house price is decreasing with lstat
plot(boost.boston, i="lstat")
# do the prediction on test data
pred.values <- predict(boost.boston, newdata = boston.df[test.idx,], n.trees = 5000)
obs.values <- boston.df[test.idx,]$medv
# find the R^2 for test data (boosting, 5000 trees, depth 4)
(R.squared <- round((cor(pred.values, obs.values) ^ 2), 3))
## [1] 0.822
# now plot the observed vs predicted on test data
plot.data <- data.frame("pred.values" = pred.values, "obs.values" = obs.values,
"DataSet" = "test")
residuals <- obs.values - pred.values
plot.residuals <- data.frame("x" = obs.values, "y" = pred.values,
"x1" = obs.values, "y2" = pred.values + residuals,
"DataSet" = "test")
ggplot() +
# plot test samples
geom_point(data = plot.data,
aes(x = obs.values, y = pred.values, color = DataSet)) +
# plot residuals
geom_segment(data = plot.residuals, alpha = 0.2,
aes(x = x, y = y, xend = x1, yend = y2, group = DataSet)) +
# plot optimal regressor
geom_abline(color = "blue", slope = 1)
# plot the residuals
plot(density(residuals))
plot(pred.values, obs.values)
abline(0,1)
# testMSE (boosting, 5000 trees, depth 4)
mean((obs.values - pred.values)^2)
## [1] 13.88032
# ------------------------------------------------------------------
# boosting with specific shrinkage parameter lambda
# ------------------------------------------------------------------
# 5000 trees each with depth of 4; shrinkage parameter lambda
# (Algorithm 8.2 page 322) set explicitly to 0.2 here (gbm's default is 0.1)
boost.boston <- gbm(medv ~ ., data = boston.df[train.idx, ],
distribution = "gaussian", n.trees = 5000,
interaction.depth = 4, shrinkage = 0.2)
# rm and lstat are most important features
summary(boost.boston)
# partial dependence plots:
# marginal effect of selected feature on response after integrating out other features
plot(boost.boston, i="rm")
# median house price is decreasing with lstat
plot(boost.boston, i="lstat")
# do the prediction on test data
pred.values <- predict(boost.boston, newdata = boston.df[test.idx,], n.trees = 5000)
obs.values <- boston.df[test.idx,]$medv
# find the R^2 for test data (boosting, shrinkage = 0.2)
(R.squared <- round((cor(pred.values, obs.values) ^ 2), 3))
## [1] 0.811
# now plot the observed vs predicted on test data
plot.data <- data.frame("pred.values" = pred.values, "obs.values" = obs.values,
"DataSet" = "test")
residuals <- obs.values - pred.values
plot.residuals <- data.frame("x" = obs.values, "y" = pred.values,
"x1" = obs.values, "y2" = pred.values + residuals,
"DataSet" = "test")
ggplot() +
# plot test samples
geom_point(data = plot.data,
aes(x = obs.values, y = pred.values, color = DataSet)) +
# plot residuals
geom_segment(data = plot.residuals, alpha = 0.2,
aes(x = x, y = y, xend = x1, yend = y2, group = DataSet)) +
# plot optimal regressor
geom_abline(color = "blue", slope = 1)
# plot the residuals
plot(density(residuals))
plot(pred.values, obs.values)
abline(0,1)
# testMSE (boosting, shrinkage = 0.2)
mean((obs.values - pred.values)^2)
## [1] 14.86174
library(tidyverse)
library(dataPreparation)
library(randomForest)
library(tree)
boston.df = read.csv(
  "/Users/shahrdadshadab/env/my-R-project/ISLR/Data/datasets/BostonHousing.csv",
  header=T, stringsAsFactors = F, na.strings = "?")
boston.df <- tibble(boston.df)
# see if there is any NA in any records
na.idx <- which(is.na(boston.df))
# No NA anywhere , let's do usual clean up:
# remove constant variables
constant_cols <- whichAreConstant(boston.df)
## [1] "whichAreConstant: it took me 0s to identify 0 constant column(s)"
# remove Variables that are in double (for example col1 == col2)
(double_cols <- whichAreInDouble(boston.df))
## [1] "whichAreInDouble: it took me 0s to identify 0 column(s) to drop."
## integer(0)
# remove Variables that are exact bijections (for example col1 = A, B, B, A and col2 = 1, 2, 2, 1)
(bijections_cols <- whichAreBijection(boston.df))
## [1] "whichAreBijection: it took me 0.05s to identify 0 column(s) to drop."
## integer(0)
# FIX: a scalar `if` condition should use `&&`, not the vectorized `&`;
# and length(NULL) is 0, so the is.null() guard was redundant anyway
boston.df <- if (length(bijections_cols) > 0) boston.df[- bijections_cols] else boston.df
set.seed(1113)
no.of.train <- ceiling(0.7 * nrow(boston.df))
no.of.test <- nrow(boston.df) - no.of.train
train.idx <- sample(seq_len(nrow(boston.df)), no.of.train)
test.idx <- setdiff(seq_len(nrow(boston.df)), train.idx)
# ------------------------ Random forest ---------------------------------
# By Default :
# mtry = sqrt(Number of features) For classification
# mtry = one third of number of features For Regression
# scan forest sizes 10..100, recording test MSE and R^2 for each
plot.data.trees <- tibble("No.Of.Trees" = NULL, "test.MSE" = NULL, "R^2" = NULL)
for (no.of.trees in 10:100){
  # BUG FIX: randomForest() takes `ntree`, not `n.trees` (that is gbm's
  # spelling); the misspelled argument fell into `...` and was silently
  # ignored, so every iteration fit the default 500 trees.
  rf.boston <- randomForest(medv ~ ., data = boston.df, subset = train.idx,
                            ntree = no.of.trees, importance = FALSE)
  pred.values <- predict(rf.boston, newdata = boston.df[test.idx,])
  obs.values <- boston.df[test.idx,]$medv
  R.squared <- round((cor(pred.values, obs.values) ^ 2), 3)
  test.MSE <- mean((obs.values - pred.values)^2)
  plot.data.trees <- rbind(plot.data.trees,
                           tibble("No.Of.Trees" = no.of.trees,
                                  "test.MSE" = test.MSE,
                                  "R^2" = R.squared))
}
ggplot() +
  geom_line(data = plot.data.trees,
            aes(x = No.Of.Trees, y = test.MSE)) +
  labs(title="No of trees and test MSE")
# Number of trees that makes test.MSE minimum in the 10..100 range
plot.data.trees[which.min(plot.data.trees$test.MSE), ]
# --------------------------- mtry ------------------------------- #
# scan candidate mtry values, recording test MSE and R^2 for each
plot.data.mtry <- tibble("mtry" = NULL, "test.MSE" = NULL, "R^2" = NULL)
# FIX (precedence): the old `2:ncol(boston.df) - 1` parses as
# `(2:ncol(boston.df)) - 1`, i.e. 1:(ncol - 1); seq_len() states that
# intent explicitly without changing the values scanned.
for (m.try in seq_len(ncol(boston.df) - 1)){
  rf.boston <- randomForest(medv ~ ., data = boston.df, subset = train.idx,
                            mtry = m.try, importance = FALSE)
  pred.values <- predict(rf.boston, newdata = boston.df[test.idx,])
  obs.values <- boston.df[test.idx,]$medv
  R.squared <- round((cor(pred.values, obs.values) ^ 2), 3)
  test.MSE <- mean((obs.values - pred.values)^2)
  plot.data.mtry <- rbind(plot.data.mtry,
                          tibble("mtry" = m.try,
                                 "test.MSE" = test.MSE,
                                 "R^2" = R.squared))
}
ggplot() +
  geom_line(data = plot.data.mtry,
            aes(x = mtry, y = test.MSE)) +
  labs(title="mtry and test MSE")
# value of mtry that makes test.MSE minimum
plot.data.mtry[which.min(plot.data.mtry$test.MSE), ]
# over all the best result obtained for number of predictors = 4 and No of trees = 18:
# BUG FIX: randomForest() takes `ntree`, not `n.trees`; the old spelling was
# silently swallowed by `...`, so the 18-tree setting never took effect
rf.boston <- randomForest(medv ~ ., data = boston.df, subset = train.idx,
                          mtry = 4, ntree = 18, importance = FALSE)
pred.values <- predict(rf.boston, newdata = boston.df[test.idx,])
obs.values <- boston.df[test.idx,]$medv
# R squared:
(R.squared <- round((cor(pred.values, obs.values) ^ 2), 3))
## [1] 0.883
# testMSE:
(test.MSE <- mean((obs.values - pred.values)^2))
## [1] 9.542795
# now plot the observed vs predicted on test data
plot.data <- data.frame("pred.values" = pred.values, "obs.values" = obs.values,
                        "DataSet" = "test")
residuals <- obs.values - pred.values
plot.residuals <- data.frame("x" = obs.values, "y" = pred.values,
                             "x1" = obs.values, "y2" = pred.values + residuals,
                             "DataSet" = "test")
ggplot() +
  # plot test samples
  geom_point(data = plot.data,
             aes(x = obs.values, y = pred.values, color = DataSet)) +
  # plot residuals
  geom_segment(data = plot.residuals, alpha = 0.2,
               aes(x = x, y = y, xend = x1, yend = y2, group = DataSet)) +
  # plot optimal regressor
  geom_abline(color = "blue", slope = 1)
library(tidyverse)
library(tree)
library(randomForest)
library(dataPreparation)
options(warn = 1)
set.seed(1)
carseats.df = read.csv("/Users/shahrdadshadab/env/my-R-project/ISLR/Data/datasets/Carseats.csv",
                       header=T, stringsAsFactors = F, na.strings = "?")
carseats.df <- tibble(carseats.df)
# find NAs
carseats.df %>%
  filter_all(all_vars(is.na(.)))
# convert all character columns to factor (levels in table() order,
# i.e. sorted unique values)
char.to.fctor <- function(df)
  df %>%
  mutate_if(is.character, ~ factor(.x, levels = (.x %>% table() %>% names())))
carseats.df <- char.to.fctor(carseats.df)
# remove constant variables
constant_cols <- whichAreConstant(carseats.df)
## [1] "whichAreConstant: it took me 0s to identify 0 constant column(s)"
# remove Variables that are in double (for example col1 == col2)
(double_cols <- whichAreInDouble(carseats.df))
## [1] "whichAreInDouble: it took me 0s to identify 0 column(s) to drop."
## integer(0)
# remove Variables that are exact bijections (for example col1 = A, B, B, A and col2 = 1, 2, 2, 1)
(bijections_cols <- whichAreBijection(carseats.df))
## [1] "whichAreBijection: it took me 0.03s to identify 0 column(s) to drop."
## integer(0)
# FIX: a scalar `if` condition should use `&&`, not the vectorized `&`;
# and length(NULL) is 0, so the is.null() guard was redundant anyway
carseats.df <- if (length(bijections_cols) > 0) carseats.df[- bijections_cols] else carseats.df
#----------------------------------------
# a) split the data into train and test
set.seed(1113)
no.of.train <- ceiling(0.7 * nrow(carseats.df))
no.of.test <- nrow(carseats.df) - no.of.train
train.idx <- sample(seq_len(nrow(carseats.df)), no.of.train)
test.idx <- setdiff(seq_len(nrow(carseats.df)), train.idx)
# b) Fit a regression tree , plot the tree and interpret the results, testMSE
tree.carseats <- tree(Sales ~ ., carseats.df, subset = train.idx)
plot(tree.carseats)
text(tree.carseats, pretty = 0)
# summary
summary(tree.carseats)
##
## Regression tree:
## tree(formula = Sales ~ ., data = carseats.df, subset = train.idx)
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "Age" "Population" "CompPrice"
## [6] "Advertising" "Education" "US"
## Number of terminal nodes: 20
## Residual mean deviance: 1.895 = 492.7 / 260
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.55300 -0.89080 -0.05948 0.00000 0.90790 3.93100
# the tree
tree.carseats
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 280 2005.000 7.341
## 2) ShelveLoc: Bad,Medium 224 1322.000 6.769
## 4) Price < 94.5 41 122.500 9.141
## 8) Age < 34.5 7 20.350 10.730 *
## 9) Age > 34.5 34 80.910 8.815
## 18) Population < 56.5 6 14.780 10.550 *
## 19) Population > 56.5 28 44.200 8.443 *
## 5) Price > 94.5 183 916.800 6.237
## 10) ShelveLoc: Bad 51 191.400 4.717
## 20) Population < 180.5 19 59.460 3.535
## 40) Price < 120.5 11 20.570 4.463 *
## 41) Price > 120.5 8 16.390 2.259 *
## 21) Population > 180.5 32 89.630 5.419
## 42) Age < 63.5 21 59.040 6.035
## 84) Price < 130 12 22.060 6.998 *
## 85) Price > 130 9 11.010 4.751 *
## 43) Age > 63.5 11 7.378 4.242 *
## 11) ShelveLoc: Medium 132 562.000 6.824
## 22) Age < 47.5 45 133.000 8.001
## 44) Price < 142.5 36 78.760 8.527 *
## 45) Price > 142.5 9 4.319 5.894 *
## 23) Age > 47.5 87 334.500 6.216
## 46) Price < 137.5 74 202.300 6.562
## 92) CompPrice < 123.5 33 36.730 5.587 *
## 93) CompPrice > 123.5 41 109.000 7.346
## 186) Price < 115.5 14 15.890 8.686 *
## 187) Price > 115.5 27 54.940 6.652 *
## 47) Price > 137.5 13 72.920 4.245
## 94) Advertising < 2.5 7 22.520 2.747 *
## 95) Advertising > 2.5 6 16.360 5.993 *
## 3) ShelveLoc: Good 56 315.700 9.632
## 6) Price < 135 42 155.400 10.420
## 12) Education < 12.5 17 41.740 11.700 *
## 13) Education > 12.5 25 66.690 9.545
## 26) US: No 8 3.523 7.818 *
## 27) US: Yes 17 28.050 10.360 *
## 7) Price > 135 14 56.770 7.276
## 14) Age < 62.5 9 26.320 8.244 *
## 15) Age > 62.5 5 6.844 5.534 *
# let's draw the residuals pdf, very close to standard normal
plot(density(summary(tree.carseats)$residuals))
# deviance is sum of squared error for the tree
# do the prediction on test data
pred.values <- predict(tree.carseats, newdata = carseats.df[test.idx,])
obs.values <- carseats.df[test.idx,]$Sales
# R squared:
(R.squared <- round((cor(pred.values, obs.values) ^ 2), 3))
## [1] 0.387
# testMSE:
(test.MSE <- mean((obs.values - pred.values)^2))
## [1] 6.266808
# now plot the observed vs predicted on test data
plot.data <- data.frame("Pred.values" = pred.values, "obs.values" = obs.values,
"DataSet" = "test")
residuals <- obs.values - pred.values
plot.residuals <- data.frame("x" = obs.values, "y" = pred.values,
"x1" = obs.values, "y2" = pred.values + residuals,
"DataSet" = "test")
ggplot() +
# plot test samples
geom_point(data = plot.data,
aes(x = obs.values, y = pred.values, color = DataSet)) +
# plot residuals
geom_segment(data = plot.residuals, alpha = 0.2,
aes(x = x, y = y, xend = x1, yend = y2, group = DataSet)) +
# plot optimal regressor
geom_abline(color = "blue", slope = 1)
# ----------------------------------
# c) Let's use CV to prune the tree
cv.carseats <- cv.tree(object = tree.carseats, FUN = prune.tree, K = 10)
# Number of terminal nodes of each tree that CV considered
cv.carseats$size
## [1] 20 18 17 16 14 13 12 11 10 9 8 7 6 5 4 3 2 1
# error rate corresponding to each tree that CV considered
cv.carseats$dev
## [1] 1301.257 1301.464 1296.063 1285.945 1280.598 1298.562 1304.273 1294.526
## [9] 1302.293 1374.875 1395.500 1431.839 1424.860 1435.591 1445.018 1592.663
## [17] 1899.638 2030.819
# value of cost complexity parameter alpha that corresponds to each tree considered by CV
cv.carseats$k
## [1] -Inf 21.57798 22.49797 23.61378 24.59477 34.04505 35.11703
## [8] 38.13740 42.31567 46.93310 49.90694 56.59542 59.32975 94.50419
## [15] 103.55720 163.36062 282.58758 367.28168
# create a tibble to have these values in one place
# FIX: corrected the misspelled column name "aplpha" -> "alpha"
# (nothing downstream referenced the old name)
(cv.result <- tibble(error.rate = cv.carseats$dev, tree.size = cv.carseats$size,
                     alpha = cv.carseats$k)
)
cv.result[which.min(cv.result$error.rate), ]
# plot error rate against size and k
plot(cv.carseats$size, cv.carseats$dev, type = 'b')
plot(cv.carseats$k, cv.carseats$dev, type = 'b')
# seems like tree with 14 terminals results in minimum cv error rate
prune.carseats <- prune.tree(tree.carseats, best = 14)
#original data
str(carseats.df)
## Classes 'data.table' and 'data.frame': 400 obs. of 11 variables:
## $ Sales : num 9.5 11.22 10.06 7.4 4.15 ...
## $ CompPrice : int 138 111 113 117 141 124 115 136 132 132 ...
## $ Income : int 73 48 35 100 64 113 105 81 110 113 ...
## $ Advertising: int 11 16 10 4 3 13 0 15 0 0 ...
## $ Population : int 276 260 269 466 340 501 45 425 108 131 ...
## $ Price : int 120 83 80 97 128 72 108 120 124 124 ...
## $ ShelveLoc : Factor w/ 3 levels "Bad","Good","Medium": 1 2 3 3 1 1 3 2 3 3 ...
## $ Age : int 42 65 59 55 38 78 71 67 76 76 ...
## $ Education : int 17 10 12 14 13 16 15 10 10 17 ...
## $ Urban : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 2 2 1 1 ...
## $ US : Factor w/ 2 levels "No","Yes": 2 2 2 2 1 2 1 2 1 2 ...
#pruned tree
summary(prune.carseats)
##
## Regression tree:
## snip.tree(tree = tree.carseats, nodes = c(4L, 20L, 7L, 21L))
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "Population" "Age" "CompPrice"
## [6] "Advertising" "Education" "US"
## Number of terminal nodes: 14
## Residual mean deviance: 2.373 = 631.2 / 266
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.6960 -1.0080 -0.1319 0.0000 1.0450 4.3440
plot(prune.carseats)
text(prune.carseats, pretty = 0)
# do the prediction on test data
pred.values <- predict(prune.carseats, newdata = carseats.df[test.idx,])
obs.values <- carseats.df[test.idx,]$Sales
# R squared (pruned tree):
(R.squared <- round((cor(pred.values, obs.values) ^ 2), 3))
## [1] 0.391
# testMSE (pruned tree) :
(test.MSE <- mean((obs.values - pred.values)^2))
## [1] 6.160287
# now plot the observed vs predicted on test data
plot.data <- data.frame("Pred.values" = pred.values, "obs.values" = obs.values,
"DataSet" = "test")
residuals <- obs.values - pred.values
plot.residuals <- data.frame("x" = obs.values, "y" = pred.values,
"x1" = obs.values, "y2" = pred.values + residuals,
"DataSet" = "test")
ggplot() +
# plot test samples
geom_point(data = plot.data,
aes(x = obs.values, y = pred.values, color = DataSet)) +
# plot residuals
geom_segment(data = plot.residuals, alpha = 0.2,
aes(x = x, y = y, xend = x1, yend = y2, group = DataSet)) +
# plot optimal regressor
geom_abline(color = "blue", slope = 1)
# pruning shows slight improvement on R squared and Test.MSE
# ---------------------------------------------
# d) Use bagging approach (random forest with mtry = all predictors)
bag.carseats <- randomForest(Sales ~ ., data = carseats.df,subset = train.idx,
mtry=ncol(carseats.df)-1, importance=T)
# importance of each predictor
# %IncMSE : increase in out-of-bag MSE when the given predictor's values are
# permuted (NOTE(review): permutation importance, not exclusion — confirm
# against randomForest docs)
# IncNodePurity : Decrease on training RSS (regression) or deviance (classification)
# in each split
importance(bag.carseats)
## %IncMSE IncNodePurity
## CompPrice 28.5214943 196.214712
## Income 5.9836079 85.997845
## Advertising 14.8622966 100.626687
## Population 4.2421305 91.249508
## Price 68.1821119 671.421375
## ShelveLoc 61.0398841 503.742273
## Age 25.4366228 232.470799
## Education 0.4686614 55.718900
## Urban -1.0075799 8.037247
## US 8.3649741 19.585203
# plot importance predictors
varImpPlot(bag.carseats)
# do the prediction on test data
pred.values <- predict(bag.carseats, newdata = carseats.df[test.idx,])
obs.values <- carseats.df[test.idx,]$Sales
# R squared (bagging):
(R.squared <- round((cor(pred.values, obs.values) ^ 2), 3))
## [1] 0.677
# testMSE (bagging):
(test.MSE <- mean((obs.values - pred.values)^2))
## [1] 3.704952
# now plot the observed vs predicted on test data
plot.data <- data.frame("pred.values" = pred.values, "obs.values" = obs.values,
"DataSet" = "test")
residuals <- obs.values - pred.values
plot.residuals <- data.frame("x" = obs.values, "y" = pred.values,
"x1" = obs.values, "y2" = pred.values + residuals,
"DataSet" = "test")
ggplot() +
# plot test samples
geom_point(data = plot.data,
aes(x = obs.values, y = pred.values, color = DataSet)) +
# plot residuals
geom_segment(data = plot.residuals, alpha = 0.2,
aes(x = x, y = y, xend = x1, yend = y2, group = DataSet)) +
# plot optimal regressor
geom_abline(color = "blue", slope = 1)
# plot the residuals
plot(density(residuals))
# ---------------------------------------------
# e) Use Random Forest
# ------------------------ Random forest ---------------------------------
# By Default :
# mtry = sqrt(Number of features) For classification
# mtry = one third of number of features For Regression
# first let's find optimal number of trees
plot.data.trees <- tibble("No.Of.Trees" = NULL, "test.MSE" = NULL, "R^2" = NULL)
for (no.of.trees in 10:100){
  # BUG FIX: randomForest() takes `ntree`, not `n.trees` (gbm's spelling);
  # the misspelled argument fell into `...` and was silently ignored, so
  # every iteration fit the default 500 trees.
  rf.carseats <- randomForest(Sales ~ ., data = carseats.df, subset = train.idx,
                              ntree = no.of.trees, importance = FALSE)
  pred.values <- predict(rf.carseats, newdata = carseats.df[test.idx,])
  obs.values <- carseats.df[test.idx,]$Sales
  R.squared <- round((cor(pred.values, obs.values) ^ 2), 3)
  test.MSE <- mean((obs.values - pred.values)^2)
  plot.data.trees <- rbind(plot.data.trees,
                           tibble("No.Of.Trees" = no.of.trees,
                                  "test.MSE" = test.MSE,
                                  "R^2" = R.squared))
}
ggplot() +
  geom_line(data = plot.data.trees,
            aes(x = No.Of.Trees, y = test.MSE)) +
  labs(title="No of trees and test MSE")
# Number of trees that makes test.MSE minimum in the 10..100 range
plot.data.trees[which.min(plot.data.trees$test.MSE), ]
# Let's find number of predictors to build the tree
# --------------------------- mtry ------------------------------- #
plot.data.mtry <- tibble("mtry" = NULL, "test.MSE" = NULL, "R^2" = NULL)
# FIX (precedence): the old `2:ncol(carseats.df) - 1` parses as
# `(2:ncol(carseats.df)) - 1`, i.e. 1:(ncol - 1); seq_len() states that
# intent explicitly without changing the values scanned.
for (m.try in seq_len(ncol(carseats.df) - 1)){
  rf.carseats <- randomForest(Sales ~ ., data = carseats.df, subset = train.idx,
                              mtry = m.try, importance = FALSE)
  pred.values <- predict(rf.carseats, newdata = carseats.df[test.idx,])
  obs.values <- carseats.df[test.idx,]$Sales
  R.squared <- round((cor(pred.values, obs.values) ^ 2), 3)
  test.MSE <- mean((obs.values - pred.values)^2)
  plot.data.mtry <- rbind(plot.data.mtry,
                          tibble("mtry" = m.try,
                                 "test.MSE" = test.MSE,
                                 "R^2" = R.squared))
}
ggplot() +
  geom_line(data = plot.data.mtry,
            aes(x = mtry, y = test.MSE)) +
  labs(title="mtry and test MSE")
# value of mtry that makes test.MSE minimum
plot.data.mtry[which.min(plot.data.mtry$test.MSE), ]
# over all the best result obtained for number of predictors = 8 and No of trees = 33:
# BUG FIX: randomForest() takes `ntree`, not `n.trees`; the old spelling was
# silently swallowed by `...`, so the 33-tree setting never took effect
rf.carseats <- randomForest(Sales ~ ., data = carseats.df, subset = train.idx,
                            mtry = 8, ntree = 33, importance = FALSE)
pred.values <- predict(rf.carseats, newdata = carseats.df[test.idx,])
obs.values <- carseats.df[test.idx,]$Sales
# R squared:
(R.squared <- round((cor(pred.values, obs.values) ^ 2), 3))
## [1] 0.687
# testMSE:
(test.MSE <- mean((obs.values - pred.values)^2))
## [1] 3.677755
# now plot the observed vs predicted on test data
plot.data <- data.frame("pred.values" = pred.values, "obs.values" = obs.values,
                        "DataSet" = "test")
residuals <- obs.values - pred.values
plot.residuals <- data.frame("x" = obs.values, "y" = pred.values,
                             "x1" = obs.values, "y2" = pred.values + residuals,
                             "DataSet" = "test")
ggplot() +
  # plot test samples
  geom_point(data = plot.data,
             aes(x = obs.values, y = pred.values, color = DataSet)) +
  # plot residuals
  geom_segment(data = plot.residuals, alpha = 0.2,
               aes(x = x, y = y, xend = x1, yend = y2, group = DataSet)) +
  # plot optimal regressor
  geom_abline(color = "blue", slope = 1)
# importance: how much training RSS is decreased (node purity increased) on each split
importance(rf.carseats)
## IncNodePurity
## CompPrice 192.082314
## Income 92.591706
## Advertising 115.678146
## Population 94.699812
## Price 646.896851
## ShelveLoc 491.485512
## Age 228.380907
## Education 61.620468
## Urban 7.736238
## US 19.059774
varImpPlot(rf.carseats)
library(tidyverse)
library(tree)
library(randomForest)
library(dataPreparation)
options(warn = 1)
oj.df = read.csv("/Users/shahrdadshadab/env/my-R-project/ISLR/Data/datasets/orange_juice_withmissing.csv",
header=T, stringsAsFactors = F, na.strings = "?")
oj.df <- tibble(oj.df)
#----------------------------------------
# a) split the data into train and test
# lets first split data to 70% train and the rest test
# NOTE(review): sample() truncates the non-integer size 0.7*nrow(oj.df)
set.seed(1113)
train.idx <- sample(1:nrow(oj.df), 0.7*nrow(oj.df))
test.idx <- setdiff(1:nrow(oj.df), train.idx)
train.df <- oj.df[train.idx, ]
test.df <- oj.df[test.idx, ]
# remove empty characters and NA helper: keep only rows whose character
# columns are neither blank nor the literal string "NA" after trimming
remove.NA.chars <- function(df)
  df %>%
  select_all %>%
  filter_if(is.character, all_vars(trimws(.) != "" & trimws(.) != "NA"))
# Clean up train data set
train.df <-
  train.df %>%
  na.omit() %>%
  remove.NA.chars()
# remove constant variables
(constant_cols <- whichAreConstant(train.df))
## [1] "whichAreConstant: it took me 0s to identify 0 constant column(s)"
## integer(0)
# remove Variables that are in double (for example col1 == col2)
(double_cols <- whichAreInDouble(train.df))
## [1] "whichAreInDouble: it took me 0.02s to identify 0 column(s) to drop."
## integer(0)
# remove Variables that are exact bijections (for example col1 = A, B, B, A and col2 = 1, 2, 2, 1)
(bijections_cols <- whichAreBijection(train.df))
## [1] "whichAreBijection: it took me 0.11s to identify 0 column(s) to drop."
## integer(0)
# FIX: a scalar `if` condition should use `&&`, not the vectorized `&`;
# and length(NULL) is 0, so the is.null() guard was redundant anyway
train.df <- if (length(bijections_cols) > 0) train.df[- bijections_cols] else train.df
# Convert character columns to numeric in one pass (replaces 14 copy-pasted
# as.numeric(as.character(...)) assignments with the identical conversion)
oj.numeric.cols <- c("PriceCH", "PriceMM", "DiscCH", "DiscMM", "SpecialCH",
                     "SpecialMM", "LoyalCH", "SalePriceCH", "SalePriceMM",
                     "PriceDiff", "PctDiscMM", "PctDiscCH", "StoreID", "STORE")
train.df <- train.df %>%
  mutate_at(oj.numeric.cols, ~ as.numeric(as.character(.x)))
# remove constant variables
constant_cols <- whichAreConstant(train.df)
## [1] "whichAreConstant: it took me 0s to identify 0 constant column(s)"
# remove Variables that are in double (for example col1 == col2)
(double_cols <- whichAreInDouble(train.df))
## [1] "whichAreInDouble: it took me 0s to identify 0 column(s) to drop."
## integer(0)
# remove Variables that are exact bijections (for example col1 = A, B, B, A and col2 = 1, 2, 2, 1)
(bijections_cols <- whichAreBijection(train.df))
## [1] "whichAreBijection: STORE is a bijection of StoreID. I put it in drop list."
## [1] "whichAreBijection: it took me 0.06s to identify 1 column(s) to drop."
## [1] 18
train.df <- if (length(bijections_cols) > 0) train.df[- bijections_cols] else train.df
# It looks Purchase has only two values "CH" and "MM"
table(train.df$Purchase)
##
## CH MM
## 444 278
# convert all character columns to factor
char.to.fctor <- function(df)
  df %>%
  mutate_if(is.character, ~ factor(.x, levels = (.x %>% table() %>% names())))
train.df <- char.to.fctor(train.df)
# Purchase is encoded by factor as CH -> 1, MM -> 2
head(train.df$Purchase)
## [1] MM CH CH CH MM MM
## Levels: CH MM
as.integer(head(train.df$Purchase))
## [1] 2 1 1 1 2 2
# --------- do similar clean up for test data ----------------
# remove empty characters and NA helper: keep only rows whose character
# columns are neither blank nor the literal string "NA" after trimming
remove.NA.chars <- function(df)
  df %>%
  select_all %>%
  filter_if(is.character, all_vars(trimws(.) != "" & trimws(.) != "NA"))
# Clean up test data set (the old comment said "train" — copy-paste)
test.df <-
  test.df %>%
  na.omit() %>%
  remove.NA.chars()
# remove constant variables
(constant_cols <- whichAreConstant(test.df))
## [1] "whichAreConstant: it took me 0s to identify 0 constant column(s)"
## integer(0)
# remove Variables that are in double (for example col1 == col2)
(double_cols <- whichAreInDouble(test.df))
## [1] "whichAreInDouble: it took me 0s to identify 0 column(s) to drop."
## integer(0)
# remove Variables that are exact bijections (for example col1 = A, B, B, A and col2 = 1, 2, 2, 1)
(bijections_cols <- whichAreBijection(test.df))
## [1] "whichAreBijection: it took me 0.09s to identify 0 column(s) to drop."
## integer(0)
# FIX: a scalar `if` condition should use `&&`, not the vectorized `&`;
# and length(NULL) is 0, so the is.null() guard was redundant anyway
test.df <- if (length(bijections_cols) > 0) test.df[- bijections_cols] else test.df
# Convert character columns to numeric in one pass (replaces 14 copy-pasted
# as.numeric(as.character(...)) assignments with the identical conversion)
oj.numeric.cols <- c("PriceCH", "PriceMM", "DiscCH", "DiscMM", "SpecialCH",
                     "SpecialMM", "LoyalCH", "SalePriceCH", "SalePriceMM",
                     "PriceDiff", "PctDiscMM", "PctDiscCH", "StoreID", "STORE")
test.df <- test.df %>%
  mutate_at(oj.numeric.cols, ~ as.numeric(as.character(.x)))
# remove constant variables
constant_cols <- whichAreConstant(test.df)
## [1] "whichAreConstant: it took me 0s to identify 0 constant column(s)"
# remove Variables that are in double (for example col1 == col2)
(double_cols <- whichAreInDouble(test.df))
## [1] "whichAreInDouble: it took me 0.01s to identify 0 column(s) to drop."
## integer(0)
# remove Variables that are exact bijections (for example col1 = A, B, B, A and col2 = 1, 2, 2, 1)
(bijections_cols <- whichAreBijection(test.df))
## [1] "whichAreBijection: STORE is a bijection of StoreID. I put it in drop list."
## [1] "whichAreBijection: it took me 0.09s to identify 1 column(s) to drop."
## [1] 18
test.df <- if (length(bijections_cols) > 0) test.df[- bijections_cols] else test.df
# convert all character columns to factor
char.to.fctor <- function(df)
  df %>%
  mutate_if(is.character, ~ factor(.x, levels = (.x %>% table() %>% names())))
test.df <- char.to.fctor(test.df)
# Purchase is encoded by factor as CH -> 1, MM -> 2
head(test.df$Purchase)
## [1] CH MM CH CH CH CH
## Levels: CH MM
as.integer(head(test.df$Purchase))
## [1] 1 2 1 1 1 1
# --------------------------------------------------------------------------------
# b) Fit a classification tree , plot the tree and interpret the results, test error
# Note since "Purchase" is a factor, tree() API construct a classification tree
tree.oj <- tree(Purchase ~ ., train.df)
# Per the printed summary below:
# Number of terminal nodes: 8
# Residual mean deviance: 0.7261
# Misclassification error rate: 0.162
summary(tree.oj)
##
## Classification tree:
## tree(formula = Purchase ~ ., data = train.df)
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff" "ListPriceDiff"
## Number of terminal nodes: 8
## Residual mean deviance: 0.7261 = 518.4 / 714
## Misclassification error rate: 0.162 = 117 / 722
# ----------------------------------
# c) the tree
tree.oj
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 722 962.40 CH ( 0.61496 0.38504 )
## 2) LoyalCH < 0.50395 312 371.20 MM ( 0.28205 0.71795 )
## 4) LoyalCH < 0.276142 146 100.90 MM ( 0.10959 0.89041 )
## 8) LoyalCH < 0.035047 52 0.00 MM ( 0.00000 1.00000 ) *
## 9) LoyalCH > 0.035047 94 85.77 MM ( 0.17021 0.82979 ) *
## 5) LoyalCH > 0.276142 166 227.20 MM ( 0.43373 0.56627 )
## 10) PriceDiff < 0.05 67 73.66 MM ( 0.23881 0.76119 ) *
## 11) PriceDiff > 0.05 99 135.50 CH ( 0.56566 0.43434 ) *
## 3) LoyalCH > 0.50395 410 319.50 CH ( 0.86829 0.13171 )
## 6) PriceDiff < -0.39 26 30.29 MM ( 0.26923 0.73077 ) *
## 7) PriceDiff > -0.39 384 234.40 CH ( 0.90885 0.09115 )
## 14) LoyalCH < 0.703993 117 118.70 CH ( 0.79487 0.20513 )
## 28) ListPriceDiff < 0.235 40 54.55 CH ( 0.57500 0.42500 ) *
## 29) ListPriceDiff > 0.235 77 46.91 CH ( 0.90909 0.09091 ) *
## 15) LoyalCH > 0.703993 267 91.71 CH ( 0.95880 0.04120 ) *
# Reading terminal node 8 as an example:
# feature split criterion: LoyalCH < 0.035047
# Number of training data observations in the branch: 52
# deviance : 0.00
# prediction for the branch : MM
# Fractions of observations classification for the branch ( 0.00000 1.00000 )
#d) plot the tree
# looks the decision is mostly based on different ranges of LoyalCH and PriceDiff
# not much other predictors matter in decision making
plot(tree.oj)
text(tree.oj, pretty = 0)
# --------------------------
# e) prediction on test data
# --------------------------
# For a classification tree, predict() returns a matrix of class
# probabilities with one column per factor level, e.g.:
# CH MM
# 1 0.5750000 0.42500000
# 2 0.2388060 0.76119403
# 3 0.9588015 0.04119850
pred.probs <- predict(tree.oj, newdata = test.df)
# Label each test observation with the more probable class, referencing the
# probability columns by name rather than position (ties resolve to "MM",
# exactly as the strict ">" comparison always has)
pred.values <- ifelse(pred.probs[, "CH"] > pred.probs[, "MM"], "CH", "MM")
obs.values <- test.df$Purchase
(confusion_table <- table(pred.values, obs.values))
## obs.values
## pred.values CH MM
## CH 171 34
## MM 20 83
# Extract the four confusion-matrix cells by dimname instead of position;
# rows are predictions, columns are observations ("CH" plays the role of the
# negative class / 0, "MM" the positive class / 1).
# TN: observed CH and predicted CH
true.negative <- confusion_table["CH", "CH"]
# TP: observed MM and predicted MM
true.positive <- confusion_table["MM", "MM"]
# FP: observed CH but predicted MM
false.positive <- confusion_table["MM", "CH"]
# FN: observed MM but predicted CH
false.negative <- confusion_table["CH", "MM"]
observations.total <- true.negative + true.positive + false.positive + false.negative
observed.as.0 <- true.negative + false.positive
observed.as.1 <- false.negative + true.positive
# Random guessing rate (always predict the majority class)
(nullClassifier <- max(observed.as.0, observed.as.1) / observations.total)
## [1] 0.6201299
# classification Rate:
(classification.rate <- mean(pred.values == obs.values))
## [1] 0.8246753
# Test error Rate:
(missclassificationRate <- 1 - classification.rate)
## [1] 0.1753247
# False Positive Rate (Type I error, 1 - specificity):
(FP_rates <- false.positive / observed.as.0)
## [1] 0.104712
# True Positive Rate (sensitivity / recall / power, 1 - Type II error):
(TP_rates <- true.positive / observed.as.1)
## [1] 0.7094017
# Precision (1 - false discovery proportion):
(precisions <- true.positive / (true.positive + false.positive))
## [1] 0.8058252
# Specificity (1 - FPR)
(specificities <- 1 - false.positive / (false.positive + true.negative))
## [1] 0.895288
library(pROC)
# BUG FIX (review): the original fed roc() the per-row *maximum* probability
# (max(p_CH, p_MM)), which is not monotone in the likelihood of either class,
# so the resulting ROC curve / AUC was meaningless.  roc() needs one
# continuous score for a single class; use the predicted probability of "MM".
# (Expected messages change accordingly; with MM as the case class, controls
# should now score lower than cases — verify on re-run.)
roc_obj <- roc(obs.values, pred.probs[, "MM"])
## Setting levels: control = CH, case = MM
## Setting direction: controls < cases
# Let's draw some AUC plots
plot(roc_obj, legacy.axes = TRUE)
# ------------------------------------
# f) apply cv.tree to prune the tree
# ------------------------------------
# FUN = prune.misclass: guide the cost-complexity pruning by classification
# error rate (rather than deviance); K = 10 cross-validation folds
cv.oj <- cv.tree(object = tree.oj, FUN = prune.misclass, K = 10)
# Number of terminal nodes of each tree that CV considered
cv.oj$size
## [1] 8 5 3 2 1
# CV error (number of misclassified observations) for each tree considered
cv.oj$dev
## [1] 135 135 134 144 278
# value of cost complexity parameter alpha that corresponds to each tree considered by CV
cv.oj$k
## [1] -Inf 0.0 6.5 12.0 136.0
# -------------------------------------------------
# g) plot tree size vs CV classification error rate
# --------------------------------------------------
# plot the misclassification error rate as a function of size and k
plot(cv.oj$size, cv.oj$dev, type = 'b')
plot(cv.oj$k, cv.oj$dev, type = 'b')
# ---------------------------------------------------------------------------------------
# h) size 3 attains the smallest CV error above (134), so 3 is the best tree size
# ---------------------------------------------------------------------------------------
#--------------------------------------
# i) let's prune it to get to tree with size 3
# ---------------------------------------------
prune.oj <- prune.misclass(tree.oj, best = 3)
plot(prune.oj)
text(prune.oj, pretty = 0)
# let's run it on the test set to see the misclassification rate again:
#---------------------------------------------------------------------
# j) compare training error rates between pruned and unpruned tree
# --------------------------------------------------------------------
# pruned tree: training misclassification 0.1801 vs 0.162 for the full tree,
# so pruning costs a little training accuracy (as expected)
summary(prune.oj)
##
## Classification tree:
## snip.tree(tree = tree.oj, nodes = c(7L, 2L))
## Variables actually used in tree construction:
## [1] "LoyalCH" "PriceDiff"
## Number of terminal nodes: 3
## Residual mean deviance: 0.8844 = 635.9 / 719
## Misclassification error rate: 0.1801 = 130 / 722
#---------------------------------------------------------------------
# j) compare test error rates between pruned and unpruned tree
# --------------------------------------------------------------------
# predict() returns the class-probability matrix for the pruned tree, e.g.:
# CH MM
# 1 0.5750000 0.42500000
# 2 0.2388060 0.76119403
# 3 0.9588015 0.04119850
pred.probs <- predict(prune.oj, newdata = test.df)
pred.values <- ifelse(pred.probs[, 1] > pred.probs[, 2], "CH", "MM")
obs.values <- test.df$Purchase
# BUG FIX (review): the original re-assigned pred.probs from the UNPRUNED
# tree (predict(tree.oj, ...)) right here, so the ROC computed further down
# used the wrong model's probabilities.  The stray line was removed;
# pred.probs now remains the pruned tree's probability matrix.
(confusion_table <- table(pred.values, obs.values))
## obs.values
## pred.values CH MM
## CH 145 19
## MM 46 98
# TN: observed CH ("0") and predicted CH
true.negative <- confusion_table[1,1]
# TP: observed MM ("1") and predicted MM
true.positive <- confusion_table[2,2]
# FP: observed CH but predicted MM
false.positive <- confusion_table[2,1]
# FN: observed MM but predicted CH
false.negative <- confusion_table[1,2]
observations.total <- true.negative + false.negative + true.positive + false.positive
observed.as.0 <- true.negative + false.positive
observed.as.1 <- false.negative + true.positive
# Random guessing Rate (always predict the majority class)
(nullClassifier <- max(observed.as.0 / observations.total, observed.as.1 / observations.total))
## [1] 0.6201299
# classification Rate:
(classification.rate <- mean(pred.values == obs.values))
## [1] 0.788961
# Test error Rate:
(missclassificationRate <- 1 - classification.rate)
## [1] 0.211039
# False Positive Rate (Type I error, 1 - specificity):
(FP_rates <- false.positive / observed.as.0)
## [1] 0.2408377
# True Positive Rate (sensitivity / recall / power, 1 - Type II error):
(TP_rates <- true.positive / observed.as.1)
## [1] 0.8376068
# Precision (1 - false discovery proportion):
# Model predicted 144 purchases of MM (46 + 98) but only 98 were real (~68%)
(precisions <- true.positive / (true.positive + false.positive))
## [1] 0.6805556
# Specificity
(specificities <- 1 - false.positive/(false.positive + true.negative))
## [1] 0.7591623
library(pROC)
# BUG FIX (review): as with the unpruned tree, feeding roc() the per-row
# maximum probability gives a meaningless curve (the max is not monotone in
# either class likelihood).  Score each observation by its predicted
# probability of the case class "MM" instead.
# (Expected messages change accordingly — verify on re-run.)
roc_obj <- roc(obs.values, pred.probs[, "MM"])
## Setting levels: control = CH, case = MM
## Setting direction: controls < cases
# Let's draw some AUC plots
plot(roc_obj, legacy.axes = TRUE)
# pruned tree classifier behaves worse than the full tree:
# ================================================================
# Next exercise: boosting / lasso / bagging on the Hitters data set
# ================================================================
library(tidyverse)
library(gbm)
library(glmnet) # for Lasso
## Loaded glmnet 3.0-2
library(randomForest)
library(dataPreparation)
options(warn = 1)
set.seed(1311)
hitters.df = read.csv(
"/Users/shahrdadshadab/env/my-R-project/ISLR/Data/datasets/Hitters.csv",
header=T, stringsAsFactors = F, na.strings = "?")
# -------------------------------
#
# Note: na.strings was set to "?" above, so literal "NA" text in the CSV is
# read as the *string* "NA", not as a real NA — Salary is therefore character
# and the count below finds those literal strings
sum(hitters.df$Salary=="NA")
## [1] 59
hitters.df <- tibble(hitters.df)
# b) first divide the data into 70% train and 30% test
n <- nrow(hitters.df)
no.of.train <- ceiling(0.7*n)
no.of.test <- n - no.of.train
train.idx <- sample(seq_len(n), no.of.train)
test.idx <- setdiff(seq_len(n), train.idx)
train.df <- hitters.df[train.idx, ]
test.df <- hitters.df[test.idx, ]
# a) Cleaning and log transform
# So let's clean the train data set:
# row-wise scan flagging the index of every training row that contains either
# a real NA or the literal string "NA".  pmap(~c(...)) turns each row into a
# vector — NOTE(review): this coerces mixed columns to character, which is
# fine for this NA test but not for general use.
na.train.idx <-
train.df %>%
pmap(~c(...)) %>%
map(~sum(is.na(.) | str_trim(.) == "NA") > 0) %>%
unlist %>%
which()
rest.train.idx <- setdiff(seq_len(no.of.train) , na.train.idx)
train.df <- train.df[rest.train.idx, ]
# remove constant variables
constant_cols <- whichAreConstant(train.df)
## [1] "whichAreConstant: it took me 0s to identify 0 constant column(s)"
# remove Variables that are in double (for example col1 == col2)
(double_cols <- whichAreInDouble(train.df))
## [1] "whichAreInDouble: it took me 0s to identify 0 column(s) to drop."
## integer(0)
# remove Variables that are exact bijections (for example col1 = A, B, B, A and col2 = 1, 2, 2, 1)
(bijections_cols <- whichAreBijection(train.df))
## [1] "whichAreBijection: League is a bijection of NewLeague. I put it in drop list."
## [1] "whichAreBijection: it took me 0.1s to identify 1 column(s) to drop."
## [1] 14
# FIX (review): the is.null() guard was redundant (length(NULL) is 0), and a
# scalar `if` condition should use `&&`, never the vectorized `&`.
train.df <- if (length(bijections_cols) > 0) train.df[-bijections_cols] else train.df
# Salary was read in as character; coerce to numeric, then log-transform it
train.df$Salary <- train.df$Salary %>% as.character() %>% as.numeric() %>% log()
# Convert all character columns to factors whose levels are the sorted
# distinct values — identical to the names-of-table() trick, since factor()
# and table() both order levels by sort(unique(x)) and drop NA
char.to.fctor <- function(df) {
  df %>%
    mutate_if(is.character, ~ factor(.x, levels = sort(unique(.x))))
}
train.df <- char.to.fctor(train.df)
# Now clean the test data the same way:
# flag the index of every test row containing a real NA or the literal "NA"
na.test.idx <-
test.df %>%
pmap(~c(...)) %>%
map(~sum(is.na(.) | str_trim(.) == "NA") > 0) %>%
unlist %>%
which()
rest.test.idx <- setdiff(seq_len(no.of.test) , na.test.idx)
test.df <- test.df[rest.test.idx, ]
# remove constant variables
constant_cols <- whichAreConstant(test.df)
## [1] "whichAreConstant: it took me 0s to identify 0 constant column(s)"
# remove Variables that are in double (for example col1 == col2)
(double_cols <- whichAreInDouble(test.df))
## [1] "whichAreInDouble: it took me 0s to identify 0 column(s) to drop."
## integer(0)
# remove Variables that are exact bijections (for example col1 = A, B, B, A and col2 = 1, 2, 2, 1)
(bijections_cols <- whichAreBijection(test.df))
## [1] "whichAreBijection: League is a bijection of NewLeague. I put it in drop list."
## [1] "whichAreBijection: it took me 0.12s to identify 1 column(s) to drop."
## [1] 14
# FIX (review): the is.null() guard was redundant (length(NULL) is 0), and a
# scalar `if` condition should use `&&`, never the vectorized `&`.
test.df <- if (length(bijections_cols) > 0) test.df[-bijections_cols] else test.df
# Salary was read in as character; coerce to numeric, then log-transform it
test.df$Salary <- test.df$Salary %>% as.character() %>% as.numeric() %>% log()
# Convert all character columns to factors whose levels are the sorted
# distinct values (the same levels names(table(x)) would give)
char.to.fctor <- function(df) {
  df %>%
    mutate_if(is.character, ~ factor(.x, levels = sort(unique(.x))))
}
test.df <- char.to.fctor(test.df)
# c) perform regression boosting on the training set with 1000 trees for a
#    range of shrinkage parameters, recording train/test MSE per lambda.
# FIX (review): the original grew two tibbles with rbind() inside the loop
# (quadratic copying).  Collect one row per lambda in pre-allocated lists and
# bind once at the end.  The gbm fits happen in the same order as before, so
# the RNG stream, the fitted models, and the final hitters.boost (used later
# for variable importance) are unchanged.
n.trees <- 1000
lambda.grid <- seq(0.1, 1, len = 100)
train.rows <- vector("list", length(lambda.grid))
test.rows <- vector("list", length(lambda.grid))
for (i in seq_along(lambda.grid)) {
  lambda <- lambda.grid[i]
  hitters.boost <- gbm(Salary ~ ., data = train.df, distribution = "gaussian",
                       n.trees = n.trees, shrinkage = lambda, verbose = F)
  # training MSE for this shrinkage value
  yhat.boost.train <- predict(hitters.boost, newdata = train.df, n.trees = n.trees)
  stopifnot(identical(length(yhat.boost.train), length(train.df$Salary)))
  train.rows[[i]] <- tibble(lambdas = lambda,
                            train.MSEs = mean((yhat.boost.train - train.df$Salary)^2),
                            DataSet = "train")
  # test MSE (and R^2) for this shrinkage value
  yhat.boost.test <- predict(hitters.boost, newdata = test.df, n.trees = n.trees)
  stopifnot(identical(length(yhat.boost.test), length(test.df$Salary)))
  test.rows[[i]] <- tibble(lambdas = lambda,
                           test.MSEs = mean((yhat.boost.test - test.df$Salary)^2),
                           r.squared = round((cor(yhat.boost.test, test.df$Salary) ^ 2), 3),
                           DataSet = "test")
}
shrinkage.MSE.train.plot <- bind_rows(train.rows)
shrinkage.MSE.test.plot <- bind_rows(test.rows)
# Shrinkage factor vs. training MSE
ggplot(shrinkage.MSE.train.plot,
       aes(x = lambdas, y = train.MSEs, color = DataSet)) +
  geom_line() +
  labs(title="Shinkage factor vs. train MSE")
# Shrinkage factor vs. test MSE
ggplot(shrinkage.MSE.test.plot,
       aes(x = lambdas, y = test.MSEs, color = DataSet)) +
  geom_line() +
  labs(title="Shinkage factor vs. test MSE")
# e) Compare test MSE of boosting with the test MSE of a linear model and lasso
# find which lambda value causes the minimum test MSE
shrinkage.MSE.test.plot %>% slice(which.min(test.MSEs))
# build a linear model on training data and find the test mse
hitters.lm = lm(Salary ~ ., data = train.df)
# interval = "prediction" also returns lwr/upr bounds; only "fit" is used below
preds <- predict(hitters.lm, test.df, interval = "prediction")
yhat.lm.test <- preds[,"fit"]
stopifnot(identical (length(yhat.lm.test), length(test.df$Salary)) )
print ("Apply Linear regression:")
## [1] "Apply Linear regression:"
# LM test MSE
(test.MSE = mean((yhat.lm.test - test.df$Salary)^2))
## [1] 0.4539836
# LM R squared
(r.squared = round((cor(yhat.lm.test, test.df$Salary) ^ 2), 3))
## [1] 0.396
# Clearly the linear model's performance cannot beat the boosting model
# Clearly performance of linear model cannot beat boosing model
print ("Apply Lasso cross validation to find the best lambda and corresponding coeffs")
## [1] "Apply Lasso cross validation to find the best lambda and corresponding coeffs"
# First construct matrix from dataframe (and drop intercept column)
x.train <- model.matrix(Salary~., train.df)[, -1] # drop the intercept
x.test <- model.matrix(Salary~., test.df)[, -1] # drop the intercept
y.train <- train.df$Salary
y.test <- test.df$Salary
cv.out <- cv.glmnet(x.train, y.train, alpha=1) # Lasso
plot(cv.out)
best.lambda <- cv.out$lambda.min
# predict the model on test
pred.lasso <- predict(cv.out, s=best.lambda, newx=x.test)
# Lasso test mse for the best lambda we chose:
(lasso.mse <- mean((pred.lasso - y.test)^2))
## [1] 0.4065154
# R squared for the best lambda we chose:
(lasso.R.squared <- round((cor(pred.lasso, y.test) ^ 2), 3))
## [,1]
## 1 0.427
# f) Most important predictors in the boosting model:
summary (hitters.boost)
# Compare to the predictors the lasso kept:
# Coeffs for Lasso ("." means shrunk exactly to zero / dropped):
(coefs.lasso <- predict (cv.out, type = "coefficients", s = best.lambda))
## 20 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 4.628897e+00
## AtBat -2.083146e-03
## Hits 9.695646e-03
## HmRun 3.981258e-03
## Runs 7.700172e-04
## RBI .
## Walks 8.552672e-03
## Years 4.815029e-02
## CAtBat 3.640107e-05
## CHits .
## CHmRun 9.260731e-04
## CRuns 3.665106e-04
## CRBI .
## CWalks .
## LeagueN 1.923153e-01
## DivisionW -2.065707e-01
## PutOuts 3.612672e-04
## Assists 7.686914e-04
## Errors -1.043280e-02
## NewLeagueN -1.992285e-01
# It is noticeable that the "CHits" predictor, which is the most important
# predictor in the boosting model, is dropped (shrunk to zero) by the Lasso
# g) apply bagging to the training set and calculate test MSE and R squared
# mtry = all predictors (ncol - 1 excludes the response Salary) makes
# randomForest do bagging rather than a random forest
hitters.bag = randomForest(Salary ~ ., data = train.df, mtry=ncol(train.df) - 1, importance=T)
hitters.bag
##
## Call:
## randomForest(formula = Salary ~ ., data = train.df, mtry = ncol(train.df) - 1, importance = T)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 19
##
## Mean of squared residuals: 0.1724127
## % Var explained: 79.67
yhat.bag <- predict(hitters.bag, newdata = test.df)
# bagging test mse:
(bag.mse <- mean((yhat.bag - test.df$Salary)^2))
## [1] 0.2377224
# R squared for testMSE:
(bag.R.squared <- round((cor(yhat.bag, test.df$Salary) ^ 2), 3))
## [1] 0.637
# surprisingly bagging produced a better result than gradient boosting
# ================================================================
# Next exercise: boosting vs. logistic regression on the Caravan data
# ================================================================
library(tidyverse)
library(gbm)
library(glmnet) # for Lasso
library(randomForest)
library(dataPreparation)
library(class)
options(warn = 1)
set.seed(1113)
caravan.df = read.csv("/Users/shahrdadshadab/env/my-R-project/ISLR/Data/Caravan.csv", header=T,
stringsAsFactors = F, na.strings = "?")
caravan.df = as_tibble(caravan.df)
# remove constant variables
constant_cols <- whichAreConstant(caravan.df)
## [1] "whichAreConstant: it took me 0.01s to identify 0 constant column(s)"
# remove Variables that are in double (for example col1 == col2)
(double_cols <- whichAreInDouble(caravan.df))
## [1] "whichAreInDouble: it took me 0.1s to identify 0 column(s) to drop."
## integer(0)
# remove Variables that are exact bijections (for example col1 = A, B, B, A and col2 = 1, 2, 2, 1)
(bijections_cols <- whichAreBijection(caravan.df))
## [1] "whichAreBijection: it took me 1.95s to identify 0 column(s) to drop."
## integer(0)
# FIX (review): the is.null() guard was redundant (length(NULL) is 0), and a
# scalar `if` condition should use `&&`, never the vectorized `&`.
caravan.df <- if (length(bijections_cols) > 0) caravan.df[-bijections_cols] else caravan.df
# How many observations do we have?
nrow(caravan.df)
## [1] 5822
# Look for rows that are entirely NA
caravan.df %>%
  filter_all(all_vars(is.na(.)))
# No NA is in data set
# gbm's 'bernoulli' distribution needs a numeric 0/1 response rather than a
# factor, so encode Purchase by hand: "Yes" -> 1, "No" -> 0
caravan.df$Purchase <- as.numeric(caravan.df$Purchase == "Yes")
str(caravan.df)
## Classes 'data.table' and 'data.frame': 5822 obs. of 87 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ MOSTYPE : int 33 37 37 9 40 23 39 33 33 11 ...
## $ MAANTHUI: int 1 1 1 1 1 1 2 1 1 2 ...
## $ MGEMOMV : int 3 2 2 3 4 2 3 2 2 3 ...
## $ MGEMLEEF: int 2 2 2 3 2 1 2 3 4 3 ...
## $ MOSHOOFD: int 8 8 8 3 10 5 9 8 8 3 ...
## $ MGODRK : int 0 1 0 2 1 0 2 0 0 3 ...
## $ MGODPR : int 5 4 4 3 4 5 2 7 1 5 ...
## $ MGODOV : int 1 1 2 2 1 0 0 0 3 0 ...
## $ MGODGE : int 3 4 4 4 4 5 5 2 6 2 ...
## $ MRELGE : int 7 6 3 5 7 0 7 7 6 7 ...
## $ MRELSA : int 0 2 2 2 1 6 2 2 0 0 ...
## $ MRELOV : int 2 2 4 2 2 3 0 0 3 2 ...
## $ MFALLEEN: int 1 0 4 2 2 3 0 0 3 2 ...
## $ MFGEKIND: int 2 4 4 3 4 5 3 5 3 2 ...
## $ MFWEKIND: int 6 5 2 4 4 2 6 4 3 6 ...
## $ MOPLHOOG: int 1 0 0 3 5 0 0 0 0 0 ...
## $ MOPLMIDD: int 2 5 5 4 4 5 4 3 1 4 ...
## $ MOPLLAAG: int 7 4 4 2 0 4 5 6 8 5 ...
## $ MBERHOOG: int 1 0 0 4 0 2 0 2 1 2 ...
## $ MBERZELF: int 0 0 0 0 5 0 0 0 1 0 ...
## $ MBERBOER: int 1 0 0 0 4 0 0 0 0 0 ...
## $ MBERMIDD: int 2 5 7 3 0 4 4 2 1 3 ...
## $ MBERARBG: int 5 0 0 1 0 2 1 5 8 3 ...
## $ MBERARBO: int 2 4 2 2 0 2 5 2 1 3 ...
## $ MSKA : int 1 0 0 3 9 2 0 2 1 1 ...
## $ MSKB1 : int 1 2 5 2 0 2 1 1 1 2 ...
## $ MSKB2 : int 2 3 0 1 0 2 4 2 0 1 ...
## $ MSKC : int 6 5 4 4 0 4 5 5 8 4 ...
## $ MSKD : int 1 0 0 0 0 2 0 2 1 2 ...
## $ MHHUUR : int 1 2 7 5 4 9 6 0 9 0 ...
## $ MHKOOP : int 8 7 2 4 5 0 3 9 0 9 ...
## $ MAUT1 : int 8 7 7 9 6 5 8 4 5 6 ...
## $ MAUT2 : int 0 1 0 0 2 3 0 4 2 1 ...
## $ MAUT0 : int 1 2 2 0 1 3 1 2 3 2 ...
## $ MZFONDS : int 8 6 9 7 5 9 9 6 7 6 ...
## $ MZPART : int 1 3 0 2 4 0 0 3 2 3 ...
## $ MINKM30 : int 0 2 4 1 0 5 4 2 7 2 ...
## $ MINK3045: int 4 0 5 5 0 2 3 5 2 3 ...
## $ MINK4575: int 5 5 0 3 9 3 3 3 1 3 ...
## $ MINK7512: int 0 2 0 0 0 0 0 0 0 1 ...
## $ MINK123M: int 0 0 0 0 0 0 0 0 0 0 ...
## $ MINKGEM : int 4 5 3 4 6 3 3 3 2 4 ...
## $ MKOOPKLA: int 3 4 4 4 3 3 5 3 3 7 ...
## $ PWAPART : int 0 2 2 0 0 0 0 0 0 2 ...
## $ PWABEDR : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PWALAND : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PPERSAUT: int 6 0 6 6 0 6 6 0 5 0 ...
## $ PBESAUT : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PMOTSCO : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PVRAAUT : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PAANHANG: int 0 0 0 0 0 0 0 0 0 0 ...
## $ PTRACTOR: int 0 0 0 0 0 0 0 0 0 0 ...
## $ PWERKT : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PBROM : int 0 0 0 0 0 0 0 3 0 0 ...
## $ PLEVEN : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PPERSONG: int 0 0 0 0 0 0 0 0 0 0 ...
## $ PGEZONG : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PWAOREG : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PBRAND : int 5 2 2 2 6 0 0 0 0 3 ...
## $ PZEILPL : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PPLEZIER: int 0 0 0 0 0 0 0 0 0 0 ...
## $ PFIETS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PINBOED : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PBYSTAND: int 0 0 0 0 0 0 0 0 0 0 ...
## $ AWAPART : int 0 2 1 0 0 0 0 0 0 1 ...
## $ AWABEDR : int 0 0 0 0 0 0 0 0 0 0 ...
## $ AWALAND : int 0 0 0 0 0 0 0 0 0 0 ...
## $ APERSAUT: int 1 0 1 1 0 1 1 0 1 0 ...
## $ ABESAUT : int 0 0 0 0 0 0 0 0 0 0 ...
## $ AMOTSCO : int 0 0 0 0 0 0 0 0 0 0 ...
## $ AVRAAUT : int 0 0 0 0 0 0 0 0 0 0 ...
## $ AAANHANG: int 0 0 0 0 0 0 0 0 0 0 ...
## $ ATRACTOR: int 0 0 0 0 0 0 0 0 0 0 ...
## $ AWERKT : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ABROM : int 0 0 0 0 0 0 0 1 0 0 ...
## $ ALEVEN : int 0 0 0 0 0 0 0 0 0 0 ...
## $ APERSONG: int 0 0 0 0 0 0 0 0 0 0 ...
## $ AGEZONG : int 0 0 0 0 0 0 0 0 0 0 ...
## $ AWAOREG : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ABRAND : int 1 1 1 1 1 0 0 0 0 1 ...
## $ AZEILPL : int 0 0 0 0 0 0 0 0 0 0 ...
## $ APLEZIER: int 0 0 0 0 0 0 0 0 0 0 ...
## $ AFIETS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ AINBOED : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ABYSTAND: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Purchase: num 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, ".internal.selfref")=<externalptr>
# a) put 1000 observations in the training set and the rest in the test set
(n <- nrow(caravan.df))
## [1] 5822
# sample(n, k) draws k values from 1:n — the identical RNG stream to
# sample(seq_len(n), k)
train.idx <- sample(n, 1000)
test.idx <- setdiff(seq_len(n), train.idx)
# sanity check: the two index sets partition seq_len(n)
stopifnot(identical(length(train.idx) + length(test.idx), n))
train.df <- caravan.df[train.idx, ]
test.df <- caravan.df[test.idx, ]
# b) Fit a boosting model and find the most important variable:
# 1000 bernoulli (logistic) boosting trees with a small shrinkage of 0.01
caravan.boost <- gbm(Purchase ~ . , data = train.df, distribution="bernoulli",
n.trees = 1000, shrinkage=0.01, verbose=F)
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution, :
## variable 61: PZEILPL has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution, :
## variable 82: AZEILPL has no variation.
# In this run, MSKC came out as the most important predictor
summary(caravan.boost)
# c) predict that a person will make a purchase if the estimated probability
#    of purchase > 0.2
# BUG FIX (review): for distribution = "bernoulli", predict.gbm() returns
# predictions on the LOG-ODDS (link) scale by default, so comparing them to
# 0.2 was NOT a 20% probability threshold.  Request type = "response" to get
# Pr(Purchase = 1 | X) directly.
prob.boost.test <- predict(caravan.boost, newdata = test.df, n.trees = 1000,
                           type = "response")
# Since we encoded Yes as 1 and No as 0 we have
yhat.boost.test <- ifelse(prob.boost.test > 0.2, 1, 0)
stopifnot(identical(length(prob.boost.test), length(test.df$Purchase)))
# Confusion Matrix (NOTE: the table recorded below came from the original
# link-scale run and will change after this fix — re-run to refresh):
(confusion_table <- table(yhat.boost.test, test.df$Purchase))
##
## yhat.boost.test 0 1
## 0 4524 296
## 1 1 1
# TN: observed as 0 and predicted as 0 (people predicted Not to make any purchase who really did not)
true.negative <- confusion_table[1,1]
# TP: observed as 1 and predicted as 1 (people predicted to make a purchase who really did)
true.positive <- confusion_table[2,2]
# FP: observed as 0 and predicted as 1 (people predicted to make a purchase who did not)
false.positive <- confusion_table[2,1]
# FN: observed as 1 and predicted as 0 (people predicted not to make a purchase who in reality did)
false.negative <- confusion_table[1,2]
observations.total <- true.negative + false.negative + true.positive + false.positive
observed.as.0 <- true.negative + false.positive
observed.as.1 <- false.negative + true.positive
# Random guessing Rate (always predict the majority class)
(nullClassifier <- max(observed.as.0 / observations.total, observed.as.1 / observations.total))
## [1] 0.9384073
# classification Rate:
(classification.rate <- mean(yhat.boost.test == test.df$Purchase))
## [1] 0.9384073
# Test error Rate:
(missclassificationRate <- 1 - classification.rate)
## [1] 0.0615927
# False Positive Rate (Type I error, 1 - specificity):
(FP_rates <- false.positive / observed.as.0)
## [1] 0.0002209945
# True Positive Rate (sensitivity / recall / power, 1 - Type II error):
(TP_rates <- true.positive / observed.as.1)
## [1] 0.003367003
# Precision (1 - false discovery proportion):
# Model predicted that two people would make a purchase but in reality only one did (50%)
(precisions <- true.positive / (true.positive + false.positive))
## [1] 0.5
# Specificity
(specificities <- 1 - false.positive/(false.positive + true.negative))
## [1] 0.999779
library(pROC)
# ROC for the boosted model: prob.boost.test is a continuous score, which is
# all roc() needs (the ROC curve is invariant to monotone transforms of the
# score).  Purchase is already a plain numeric vector, so no unlist() needed.
roc_obj <- roc(test.df$Purchase, prob.boost.test)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Let's draw some AUC plots
plot(roc_obj, legacy.axes = TRUE)
# -------------------------------------------------------------
# Let's use logistic regression to estimate what percentage of
# people who are predicted to make a purchase really did.
# (The warning below reflects quasi-separation in the data: some
# fitted probabilities are numerically 0 or 1.)
glm.model <- glm(Purchase ~ . , data = train.df, family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(glm.model)
##
## Call:
## glm(formula = Purchase ~ ., family = binomial, data = train.df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5983 -0.2918 -0.1463 -0.0555 3.3308
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.645e+02 6.732e+04 0.004 0.9969
## X 3.404e-05 1.043e-04 0.327 0.7440
## MOSTYPE 1.421e-01 1.327e-01 1.070 0.2844
## MAANTHUI -9.558e-01 7.816e-01 -1.223 0.2214
## MGEMOMV 2.900e-01 4.152e-01 0.698 0.4849
## MGEMLEEF 6.166e-01 3.394e-01 1.817 0.0692 .
## MOSHOOFD -7.014e-01 5.946e-01 -1.180 0.2382
## MGODRK -1.032e-01 3.303e-01 -0.312 0.7548
## MGODPR 4.260e-02 3.535e-01 0.121 0.9041
## MGODOV -3.587e-02 3.140e-01 -0.114 0.9091
## MGODGE -1.753e-01 3.260e-01 -0.538 0.5907
## MRELGE 3.549e-01 4.685e-01 0.757 0.4488
## MRELSA 4.875e-01 4.441e-01 1.098 0.2724
## MRELOV 5.770e-01 4.565e-01 1.264 0.2063
## MFALLEEN -2.331e-02 3.901e-01 -0.060 0.9524
## MFGEKIND 1.634e-01 4.040e-01 0.404 0.6859
## MFWEKIND 9.659e-02 4.198e-01 0.230 0.8180
## MOPLHOOG -3.250e-01 3.791e-01 -0.857 0.3912
## MOPLMIDD -3.506e-01 4.118e-01 -0.852 0.3945
## MOPLLAAG -6.335e-01 4.154e-01 -1.525 0.1273
## MBERHOOG 1.507e-01 2.906e-01 0.519 0.6039
## MBERZELF 2.793e-01 3.195e-01 0.874 0.3821
## MBERBOER 1.331e-01 3.017e-01 0.441 0.6592
## MBERMIDD 3.781e-01 2.652e-01 1.426 0.1539
## MBERARBG 2.778e-01 2.721e-01 1.021 0.3072
## MBERARBO 3.859e-01 2.707e-01 1.426 0.1539
## MSKA 5.966e-01 3.372e-01 1.769 0.0769 .
## MSKB1 3.270e-01 3.041e-01 1.075 0.2823
## MSKB2 2.533e-01 2.700e-01 0.938 0.3482
## MSKC 5.236e-01 3.104e-01 1.687 0.0917 .
## MSKD 4.964e-01 2.958e-01 1.678 0.0934 .
## MHHUUR -1.506e+01 5.107e+03 -0.003 0.9976
## MHKOOP -1.499e+01 5.107e+03 -0.003 0.9977
## MAUT1 -3.144e-01 4.426e-01 -0.710 0.4775
## MAUT2 -2.313e-01 3.972e-01 -0.582 0.5604
## MAUT0 -3.000e-01 4.414e-01 -0.680 0.4967
## MZFONDS -1.546e+01 5.465e+03 -0.003 0.9977
## MZPART -1.548e+01 5.465e+03 -0.003 0.9977
## MINKM30 -2.913e-01 3.047e-01 -0.956 0.3390
## MINK3045 -8.679e-02 2.964e-01 -0.293 0.7697
## MINK4575 -3.181e-01 3.008e-01 -1.058 0.2902
## MINK7512 -1.737e-01 2.935e-01 -0.592 0.5539
## MINK123M -2.350e-01 4.364e-01 -0.539 0.5902
## MINKGEM 1.818e-01 2.678e-01 0.679 0.4972
## MKOOPKLA 1.381e-01 1.461e-01 0.946 0.3443
## PWAPART 1.792e+01 2.584e+03 0.007 0.9945
## PWABEDR -1.807e+00 4.826e+03 0.000 0.9997
## PWALAND -3.753e+01 3.604e+03 -0.010 0.9917
## PPERSAUT 1.238e-01 1.374e-01 0.901 0.3675
## PBESAUT 5.051e+00 4.702e+03 0.001 0.9991
## PMOTSCO 5.868e-01 1.061e+00 0.553 0.5802
## PVRAAUT -3.158e+00 2.955e+03 -0.001 0.9991
## PAANHANG -1.459e+00 9.047e+03 0.000 0.9999
## PTRACTOR -1.164e+01 9.330e+02 -0.012 0.9900
## PWERKT -5.459e+01 1.941e+04 -0.003 0.9978
## PBROM -2.385e+00 1.783e+00 -1.338 0.1810
## PLEVEN 9.380e-02 2.555e-01 0.367 0.7135
## PPERSONG 6.381e-01 5.150e+03 0.000 0.9999
## PGEZONG 1.966e+01 9.139e+03 0.002 0.9983
## PWAOREG -2.737e+00 4.234e+03 -0.001 0.9995
## PBRAND 3.877e-01 2.050e-01 1.891 0.0586 .
## PZEILPL NA NA NA NA
## PPLEZIER 1.173e+00 1.250e+04 0.000 0.9999
## PFIETS -2.387e-01 2.819e+00 -0.085 0.9325
## PINBOED 8.740e+00 9.541e+03 0.001 0.9993
## PBYSTAND -1.609e+00 1.244e+00 -1.293 0.1960
## AWAPART -3.558e+01 5.168e+03 -0.007 0.9945
## AWABEDR -1.273e+01 1.151e+04 -0.001 0.9991
## AWALAND 5.998e+01 9.722e+03 0.006 0.9951
## APERSAUT 4.426e-01 6.197e-01 0.714 0.4751
## ABESAUT -2.405e+01 1.604e+04 -0.001 0.9988
## AMOTSCO -2.733e+00 4.816e+00 -0.568 0.5704
## AVRAAUT NA NA NA NA
## AAANHANG -1.490e+01 1.535e+04 -0.001 0.9992
## ATRACTOR 3.659e+01 2.799e+03 0.013 0.9896
## AWERKT 9.620e+01 3.767e+04 0.003 0.9980
## ABROM 5.302e+00 3.583e+00 1.480 0.1389
## ALEVEN -1.476e-01 7.088e-01 -0.208 0.8351
## APERSONG -1.743e+01 1.643e+04 -0.001 0.9992
## AGEZONG -5.760e+01 2.742e+04 -0.002 0.9983
## AWAOREG -1.288e+00 2.192e+04 0.000 1.0000
## ABRAND -4.226e-01 7.520e-01 -0.562 0.5741
## AZEILPL NA NA NA NA
## APLEZIER -2.323e+01 4.495e+04 -0.001 0.9996
## AFIETS 1.134e+00 1.870e+00 0.606 0.5442
## AINBOED -2.485e+01 1.558e+04 -0.002 0.9987
## ABYSTAND 6.640e+00 4.463e+00 1.488 0.1368
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 402.90 on 999 degrees of freedom
## Residual deviance: 287.16 on 916 degrees of freedom
## AIC: 455.16
##
## Number of Fisher Scoring iterations: 19
# do the prediction using the model
# FIX (review): spell out se.fit = TRUE — the original's `se = T` relied on
# partial argument matching (and T is reassignable; use TRUE).
preds <- predict(glm.model, newdata = test.df, se.fit = TRUE)
## Warning in predict.lm(object, newdata, se.fit, scale = residual.scale, type = if
## (type == : prediction from a rank-deficient fit may be misleading
# predict() for a glm returns the linear predictor (logit) by default, i.e.
# log(Pr(Y=1 | X) / (1 - Pr(Y=1 | X))) = X*beta, with the standard error on
# the same scale; map back to probabilities with the inverse logit.
# FIX (review): plogis() is the numerically stable inverse logit —
# exp(x)/(1+exp(x)) overflows to Inf/Inf = NaN for large x.
prob.predict <- plogis(preds$fit)
# approximate +/- 2 SE band on the logit scale, then mapped to probabilities.
# FIX (review): the returned list element is named se.fit; the original's
# preds$se only worked through `$` partial matching.
se.bound.logit <- cbind(preds$fit + 2*preds$se.fit, preds$fit - 2*preds$se.fit)
se.bound <- apply(se.bound.logit, 2, plogis)
yhat.logReg.test <- ifelse(prob.predict > 0.2, 1, 0)
stopifnot(identical(length(prob.predict), length(test.df$Purchase)))
#Confusion Matrix:
(confusion_table <- table(yhat.logReg.test, test.df$Purchase))
##
## yhat.logReg.test 0 1
## 0 4214 240
## 1 311 57
# TN: observes as 0 and predicted as 0 (No of people that predicted Not to make any purchase and they really did not)
true.negative <- confusion_table[1,1]
# TP: observes as 1 and predicted as 1 (No of people that predicted to make a purchase and in reality they did)
true.positive <- confusion_table[2,2]
# FP: observes as 0 and predicted as 1 (No of people that predicted to make a purchase but they did not)
false.positive <- confusion_table[2,1]
# FN: observed as 1 and predicted as 0 (No of people that predicted they did not make a purchase but they in realty did)
false.negative <- confusion_table[1,2]
observations.total <- true.negative + false.negative + true.positive + false.positive
observed.as.0 <- true.negative + false.positive
observed.as.1 <- false.negative + true.positive
# Random guessing Rate
(nullClassifier <- max(observed.as.0 / observations.total, observed.as.1 / observations.total))
## [1] 0.9384073
# classification Rate:
(classification.rate <- mean(yhat.logReg.test == test.df$Purchase))
## [1] 0.8857321
# Test error Rate:
(missclassificationRate <- 1 - classification.rate)
## [1] 0.1142679
# False Positive Rate (or: Type I error , 1 - specificty):
(FP_rates <- false.positive / observed.as.0)
## [1] 0.06872928
# True Positive Rate (or: 1 - Type II error , power , sesetivity , recall):
(TP_rates <- true.positive / observed.as.1)
## [1] 0.1919192
# Precision (Accuracy Rate, 1 - false discovery proportion):
# Model predicted that 311+57 people make a purchase but in reality only 57 people did (15%)
(precisions <- true.positive / (true.positive + false.positive))
## [1] 0.1548913
# Specificity
(specificities <- 1 - false.positive/(false.positive + true.negative))
## [1] 0.9312707
library(pROC)
roc_obj <- roc(unlist(test.df$Purchase), yhat.logReg.test)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Let's draw some AUC plots
plot(roc_obj, legacy.axes = TRUE)
# Precision drop down from 50% in gradiant boosting to 15% in logistic regression
library(tidyverse)
library(gbm) # for original implementation of regular and stochastic GBMs
library(h2o) # for a java-based implementation of GBM variants
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:lubridate':
##
## day, hour, month, week, year
## The following object is masked from 'package:pROC':
##
## var
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
library(xgboost) # for fitting extreme gradient boosting
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
library(glmnet) # for Lasso
library(randomForest)
library(dataPreparation)
library(class)
library(doParallel) # for parallel backend to foreach
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loading required package: iterators
## Loading required package: parallel
library(foreach) # for parallel processing with for loops
# Modeling packages
library(caret) # for general model fitting
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(rpart) # for fitting decision trees
library(ipred)
# Print warnings as they occur instead of holding them until the end.
options(warn = 1)
set.seed(1113)
# Load the Smarket data; "?" marks missing values in the raw file.
smarket.df <- read.csv(
  "/Users/shahrdadshadab/env/my-R-project/ISLR/Data/Smarket.csv",
  header = TRUE, stringsAsFactors = FALSE, na.strings = "?"
)
str(smarket.df)
## 'data.frame': 1250 obs. of 10 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Year : int 2001 2001 2001 2001 2001 2001 2001 2001 2001 2001 ...
## $ Lag1 : num 0.381 0.959 1.032 -0.623 0.614 ...
## $ Lag2 : num -0.192 0.381 0.959 1.032 -0.623 ...
## $ Lag3 : num -2.624 -0.192 0.381 0.959 1.032 ...
## $ Lag4 : num -1.055 -2.624 -0.192 0.381 0.959 ...
## $ Lag5 : num 5.01 -1.055 -2.624 -0.192 0.381 ...
## $ Volume : num 1.19 1.3 1.41 1.28 1.21 ...
## $ Today : num 0.959 1.032 -0.623 0.614 0.213 ...
## $ Direction: chr "Up" "Up" "Down" "Up" ...
# Drop degenerate columns using the dataPreparation helpers.
# Columns holding a single constant value:
constant_cols <- whichAreConstant(smarket.df)
## [1] "whichAreConstant: it took me 0s to identify 0 constant column(s)"
# Exact duplicate columns (for example col1 == col2):
(double_cols <- whichAreInDouble(smarket.df))
## [1] "whichAreInDouble: it took me 0s to identify 0 column(s) to drop."
## integer(0)
# Exact bijections (for example col1 = A, B, B, A and col2 = 1, 2, 2, 1):
(bijections_cols <- whichAreBijection(smarket.df))
## [1] "whichAreBijection: it took me 0.06s to identify 0 column(s) to drop."
## integer(0)
# BUG FIX: use the scalar, short-circuiting && in an if() condition, not the
# vectorized & (since R 4.3, && even errors on length > 1 operands).
smarket.df <- if (!is.null(bijections_cols) && length(bijections_cols) > 0)
smarket.df[- bijections_cols] else smarket.df
# Show any rows that are entirely NA (prints an empty frame when none exist).
smarket.df %>%
filter_all(all_vars(is.na(.)))
# Make sure no literal "NA" string is in the character column Direction.
# (Real NA values would not match == "NA"; this only catches the string.)
(na.count <- smarket.df %>%
filter(Direction == "NA") %>%
nrow)
## [1] 0
# BUG FIX: the original `stopifnot(na.count != numeric(0))` compared a scalar
# against a zero-length vector, producing logical(0), which stopifnot() treats
# as vacuously TRUE -- the assertion could never fail. nrow() always returns a
# length-1 integer, so assert that the count is zero.
stopifnot(na.count == 0)
# Every Direction value must be either "Up" or "Down".
ups <- sum(smarket.df$Direction == "Up")
downs <- sum(smarket.df$Direction == "Down")
stopifnot(identical(ups + downs, nrow(smarket.df)))
# gbm's "bernoulli" distribution needs a numeric 0/1 response rather than a
# factor, so encode the label manually before factoring Direction itself.
smarket.df$Direction.01 <- ifelse(smarket.df$Direction == "Up", 1, 0)
smarket.df$Direction <- as.factor(smarket.df$Direction)
str(smarket.df$Direction)
## Factor w/ 2 levels "Down","Up": 2 2 1 2 2 2 1 2 2 2 ...
# Factor codes: "Down" -> 1, "Up" -> 2 (levels sort alphabetically).
head(smarket.df$Direction)
## [1] Up Up Down Up Up Up
## Levels: Down Up
head(as.integer(smarket.df$Direction))
## [1] 2 2 1 2 2 2
# 70/30 train/test split.
(n <- nrow(smarket.df))
## [1] 1250
(no.of.train <- round(0.7 * n, 3))
## [1] 875
train.idx <- sample(seq_len(n), no.of.train)
test.idx <- setdiff(seq_len(n), train.idx)
stopifnot(identical(length(train.idx) + length(test.idx), n))
train.df <- smarket.df[train.idx, ]
test.df <- smarket.df[test.idx, ]
# ----------------------Gradiant boosting -----------------------
# Boosting hyperparameters:
# -------------------------
#
# Number of trees: The total number of trees in the sequence or ensemble.
# Learning rate (shrinkage): the smaller this value, the more accurate the model can
# be but also will require more trees in the sequence.
#
#
# Tree hyperparameters:
# ----------------------
#
# Tree depth: Controls the depth of the individual trees. Typical values range from a
# depth of 3–8 but it is not uncommon to see a tree depth of 1 Note that larger n or p
# training data sets are more tolerable to deeper trees.
#
# Minimum number of observations in terminal nodes: Controls the complexity of each tree.
# Typical values range from 5–15 where higher values help prevent a model from learning
# relationships which might be highly specific to the particular sample selected
# for a tree (overfitting) but smaller values can help with imbalanced target classes in classification problems.
# "expand.grid" create a Data Frame from All Combinations of Factor Variables
hyper_grid <- expand.grid(
learning_rate = c(0.3, 0.1, 0.05, 0.01, 0.005), # values between 0.05–0.2 should work across a wide range of problems
RMSE = NA,
trees = NA,
time = NA
)
for(i in seq_len(nrow(hyper_grid))){
set.seed(123)
train_time <- system.time({
m <- gbm(
formula = Direction.01 ~ . -Direction ,
data = train.df,
distribution = "bernoulli",
n.trees = 5000,
shrinkage = hyper_grid$learning_rate[i],
interaction.depth = 3,
n.minobsinnode = 10,
cv.folds = 10
)
})
# add SSE, trees, and training time to results
hyper_grid$RMSE[i] <- sqrt(min(m$cv.error))
hyper_grid$trees[i] <- which.min(m$cv.error)
hyper_grid$Time[i] <- train_time[["elapsed"]]
}
# The optimal value for learning rate (i.e shrinkage parameter) 0.3
arrange(hyper_grid, RMSE)
# Next, fix the learning rate at its optimal level (0.3) and tune the
# tree-specific hyperparameters (interaction.depth and n.minobsinnode),
# which further reduces the CV error.
# Search grid: all combinations of depth and minimum node size.
hyper_grid <- expand.grid(
  n.trees = 6000,
  shrinkage = 0.3,
  interaction.depth = c(3, 5, 7),
  n.minobsinnode = c(5, 10, 15)
)
# Fit one 10-fold-CV gbm for a single hyperparameter combination and return
# its CV error (square root of the minimum cv.error over the tree sequence).
model_fit <- function(n.trees, shrinkage, interaction.depth, n.minobsinnode) {
  set.seed(123)
  m <- gbm(
    formula = Direction.01 ~ . -Direction,
    data = train.df,
    distribution = "bernoulli",
    n.trees = n.trees,
    shrinkage = shrinkage,
    interaction.depth = interaction.depth,
    n.minobsinnode = n.minobsinnode,
    cv.folds = 10
  )
  sqrt(min(m$cv.error))
}
# Run the grid search functionally. ROBUSTNESS FIX: pmap_dbl() matches the
# grid's columns to model_fit's parameters BY NAME; the original ..1/..2/..3/..4
# form depended silently on the column order of hyper_grid.
hyper_grid$rmse <- purrr::pmap_dbl(hyper_grid, model_fit)
# Top row shows the CV error reduced to 7.332922e-07.
arrange(hyper_grid, rmse)
# thus we choose values of n.trees -> 6000, interaction.depth -> 3,
# and n.minobsinnode -> 5 from the top row as the parameters of
# top model so far.
# Now to improve the model we took our top model’s hyperparameter settings,
# NOTE(review): the call below actually uses n.trees = 10000 and
# shrinkage = 0.06, NOT the "top" settings named above (6000 / 0.3) --
# presumably an exploratory refit to see whether more, slower trees help;
# confirm which settings were intended.
m <- gbm(
formula = Direction.01 ~ . -Direction,
data = train.df,
distribution = "bernoulli",
n.trees = 10000,
shrinkage = 0.06,
interaction.depth = 3,
n.minobsinnode = 5,
cv.folds = 10
)
# compute RMSE (sqrt of the minimum CV error over the tree sequence)
sqrt(min(m$cv.error))
## [1] 3.274839e-06
# Seems like no more improvement is possible, thus the following are the best
# set of hyperparameters:
# n.trees = 6000,
# shrinkage = 0.3,
# interaction.depth = 3,
# n.minobsinnode = 5,
# Stochastic hyperparameters:
# ---------------------------
# _Subsample rows before creating each tree (available in gbm, h2o, & xgboost)
# _Subsample columns before creating each tree (h2o & xgboost)
# _Subsample columns before considering each split in each tree (h2o & xgboost)
#
# Generally, aggressive subsampling of rows, such as selecting only 50% or less of
# the training data, has shown to be beneficial and typical values range between 0.5–0.8.
# Now let's use h2o for grid search to implement a stochastic GBM
# We use the optimal hyperparameters above and build onto this by assessing a
# range of values for subsampling rows and columns before each tree is built,
# and subsampling columns before each split.
# To speed up training we use early stopping for the individual GBM modeling
# process and also add a stochastic search criteria.
# initial h2o setup
# Silence h2o's progress bars, then connect to (or start) a local H2O JVM on
# port 54321, using all available cores (nthreads = -1) and up to 10 GB heap.
h2o.no_progress()
h2o.init(ip = "localhost", port = 54321, nthreads= -1, max_mem_size = "10g")
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 1 hours 4 minutes
## H2O cluster timezone: America/Toronto
## H2O data parsing timezone: UTC
## H2O cluster version: 3.28.0.2
## H2O cluster version age: 7 months !!!
## H2O cluster name: H2O_started_from_R_shahrdadshadab_mxj391
## H2O cluster total nodes: 1
## H2O cluster total memory: 8.55 GB
## H2O cluster total cores: 8
## H2O cluster allowed cores: 8
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Amazon S3, XGBoost, Algos, AutoML, Core V3, TargetEncoder, Core V4
## R Version: R version 3.6.2 (2019-12-12)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is too old (7 months)!
## Please download and install the latest version from http://h2o.ai/download/
# Example from the h2o docs, kept for reference (not executed):
# prostate_path <- system.file("extdata", "prostate.csv", package = "h2o")
# prostate <- h2o.uploadFile(path = prostate_path)
# Clear any objects left on the H2O cluster, then push both splits over.
h2o.removeAll()
train_h2o <- as.h2o(train.df)
test_h2o <- as.h2o(test.df)
# For classification, h2o's gbm wants a FACTOR response -- the opposite of
# R's gbm package, which requires a numeric 0/1 response.
response <- "Direction"
predictors <- setdiff(colnames(train.df), c(response, "Direction.01"))
# Stochastic-GBM subsampling grid (h2o.grid requires a list).
subsample_levels <- c(0.5, 0.75, 1)
hyper_grid <- list(
  sample_rate = subsample_levels,             # row subsampling per tree
  col_sample_rate = subsample_levels,         # column subsampling per split
  col_sample_rate_per_tree = subsample_levels # column subsampling per tree
)
# Random discrete search strategy with early stopping (also a list).
search_criteria <- list(
  strategy = "RandomDiscrete",
  stopping_metric = "mse",
  stopping_tolerance = 0.001, # stop when MSE improves by less than 0.1%...
  stopping_rounds = 10,       # ...over the last 10 models,
  max_runtime_secs = 60 * 5   # or after 5 minutes, whichever comes first
)
# perform grid search
# Hyperparameter combinations come from hyper_grid; search_criteria makes the
# search random-discrete with a 5-minute budget. Note the stopping_rounds /
# stopping_tolerance arguments BELOW control early stopping of each model's
# own tree sequence, separate from the search-level stopping in search_criteria.
grid <- h2o.grid(
algorithm = "gbm",
grid_id = "gbm_grid",
x = predictors,
y = response,
training_frame = train_h2o,
hyper_params = hyper_grid,
ntrees = 6000,
learn_rate = 0.3,
max_depth = 3,
min_rows = 5,
nfolds = 10,
stopping_rounds = 10,
stopping_tolerance = 0,
search_criteria = search_criteria,
seed = 123
)
# collect the results and sort by our model performance metric of choice
# (ascending cross-validated MSE, so the best model comes first)
grid_perf <- h2o.getGrid(
grid_id = "gbm_grid",
sort_by = "mse",
decreasing = FALSE
)
grid_perf
## H2O Grid Details
## ================
##
## Grid ID: gbm_grid
## Used hyper parameters:
## - col_sample_rate
## - col_sample_rate_per_tree
## - sample_rate
## Number of models: 24
## Number of failed models: 0
##
## Hyper-Parameter Search Summary: ordered by increasing mse
## col_sample_rate col_sample_rate_per_tree sample_rate model_ids
## 1 0.5 0.75 0.5 gbm_grid_model_21
## 2 0.5 1.0 0.75 gbm_grid_model_10
## 3 0.5 1.0 1.0 gbm_grid_model_3
## 4 0.75 1.0 1.0 gbm_grid_model_11
## 5 0.5 1.0 0.5 gbm_grid_model_12
## mse
## 1 0.001121594179032706
## 2 0.001125135863063456
## 3 0.0011325615576201725
## 4 0.0011352998866518348
## 5 0.0011383758400595574
##
## ---
## col_sample_rate col_sample_rate_per_tree sample_rate model_ids
## 19 0.75 0.5 1.0 gbm_grid_model_7
## 20 0.5 0.5 1.0 gbm_grid_model_20
## 21 0.75 0.5 0.75 gbm_grid_model_13
## 22 1.0 0.5 1.0 gbm_grid_model_9
## 23 0.5 0.5 0.75 gbm_grid_model_15
## 24 1.0 0.5 0.75 gbm_grid_model_24
## mse
## 19 0.003389857735489425
## 20 0.003421288880241799
## 21 0.004013881611597751
## 22 0.004561166107787774
## 23 0.005851238329674766
## 24 0.11648661527907224
# Grab the model_id for the top model, chosen by cross validation error
# (grid_perf is already sorted by ascending MSE, so the best is first).
best_model_id <- grid_perf@model_ids[[1]]
best_model <- h2o.getModel(best_model_id)
# variable importance plot
vip::vip(best_model)
## Warning: `as.tibble()` is deprecated as of tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
# Now let's get performance metrics on the best model.
# NOTE(review): Gini = 1 and AUC = 1 below mean perfect separation on the CV
# folds; with `Today` among the predictors (it carries ~95% of the variable
# importance further down) this looks like target leakage -- Direction is
# presumably determined by Today's sign. Confirm before trusting these numbers.
h2o.giniCoef(best_model)
## [1] 1
h2o.performance(model = best_model, xval = TRUE)
## H2OBinomialMetrics: gbm
## ** Reported on cross-validation data. **
## ** 10-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
##
## MSE: 0.001121594
## RMSE: 0.03349021
## LogLoss: 0.005421611
## Mean Per-Class Error: 0
## AUC: 1
## AUCPR: 0.2068966
## Gini: 1
## R^2: 0.9954971
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## Down Up Error Rate
## Down 411 0 0.000000 =0/411
## Up 0 464 0.000000 =0/464
## Totals 411 464 0.000000 =0/875
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.990742 1.000000 64
## 2 max f2 0.990742 1.000000 64
## 3 max f0point5 0.990742 1.000000 64
## 4 max accuracy 0.990742 1.000000 64
## 5 max precision 1.000000 1.000000 0
## 6 max recall 0.990742 1.000000 64
## 7 max specificity 1.000000 1.000000 0
## 8 max absolute_mcc 0.990742 1.000000 64
## 9 max min_per_class_accuracy 0.990742 1.000000 64
## 10 max mean_per_class_accuracy 0.990742 1.000000 64
## 11 max tns 1.000000 411.000000 0
## 12 max fns 1.000000 96.000000 0
## 13 max fps 0.000000 411.000000 399
## 14 max tps 0.990742 464.000000 64
## 15 max tnr 1.000000 1.000000 0
## 16 max fnr 1.000000 0.206897 0
## 17 max fpr 0.000000 1.000000 399
## 18 max tpr 0.990742 1.000000 64
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
# Full summary: model details, training and cross-validation metrics, scoring
# history, and variable importances. NOTE(review): `Today` carries ~95% of the
# variable importance and the metrics are near-perfect, which suggests the
# response may be directly derivable from Today -- verify for leakage.
summary(best_model)
## Model Details:
## ==============
##
## H2OBinomialModel: gbm
## Model Key: gbm_grid_model_21
## Model Summary:
## number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1 754 754 82070 1
## max_depth mean_depth min_leaves max_leaves mean_leaves
## 1 3 1.91247 2 8 3.92042
##
## H2OBinomialMetrics: gbm
## ** Reported on training data. **
##
## MSE: 3.380832e-34
## RMSE: 1.838704e-17
## LogLoss: 1.015061e-18
## Mean Per-Class Error: 0
## AUC: 1
## AUCPR: 0.006465517
## Gini: 1
## R^2: 1
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## Down Up Error Rate
## Down 411 0 0.000000 =0/411
## Up 3 461 0.006466 =3/464
## Totals 414 461 0.003429 =3/875
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 1.000000 1.000000 2
## 2 max f2 1.000000 1.000000 2
## 3 max f0point5 1.000000 1.000000 2
## 4 max accuracy 1.000000 1.000000 2
## 5 max precision 1.000000 1.000000 0
## 6 max recall 1.000000 1.000000 2
## 7 max specificity 1.000000 1.000000 0
## 8 max absolute_mcc 1.000000 1.000000 2
## 9 max min_per_class_accuracy 1.000000 1.000000 2
## 10 max mean_per_class_accuracy 1.000000 1.000000 2
## 11 max tns 1.000000 411.000000 0
## 12 max fns 1.000000 3.000000 0
## 13 max fps 0.000000 411.000000 4
## 14 max tps 1.000000 464.000000 2
## 15 max tnr 1.000000 1.000000 0
## 16 max fnr 1.000000 0.006466 0
## 17 max fpr 0.000000 1.000000 4
## 18 max tpr 1.000000 1.000000 2
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
##
## H2OBinomialMetrics: gbm
## ** Reported on cross-validation data. **
## ** 10-fold cross-validation on training data (Metrics computed for combined holdout predictions) **
##
## MSE: 0.001121594
## RMSE: 0.03349021
## LogLoss: 0.005421611
## Mean Per-Class Error: 0
## AUC: 1
## AUCPR: 0.2068966
## Gini: 1
## R^2: 0.9954971
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## Down Up Error Rate
## Down 411 0 0.000000 =0/411
## Up 0 464 0.000000 =0/464
## Totals 411 464 0.000000 =0/875
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.990742 1.000000 64
## 2 max f2 0.990742 1.000000 64
## 3 max f0point5 0.990742 1.000000 64
## 4 max accuracy 0.990742 1.000000 64
## 5 max precision 1.000000 1.000000 0
## 6 max recall 0.990742 1.000000 64
## 7 max specificity 1.000000 1.000000 0
## 8 max absolute_mcc 0.990742 1.000000 64
## 9 max min_per_class_accuracy 0.990742 1.000000 64
## 10 max mean_per_class_accuracy 0.990742 1.000000 64
## 11 max tns 1.000000 411.000000 0
## 12 max fns 1.000000 96.000000 0
## 13 max fps 0.000000 411.000000 399
## 14 max tps 0.990742 464.000000 64
## 15 max tnr 1.000000 1.000000 0
## 16 max fnr 1.000000 0.206897 0
## 17 max fpr 0.000000 1.000000 399
## 18 max tpr 0.990742 1.000000 64
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
## Cross-Validation Metrics Summary:
## mean sd cv_1_valid cv_2_valid cv_3_valid cv_4_valid
## accuracy 1.0 0.0 1.0 1.0 1.0 1.0
## auc 1.0 0.0 1.0 1.0 1.0 1.0
## aucpr 0.7599853 0.38075897 0.975 0.97619045 0.9811321 0.9756098
## err 0.0 0.0 0.0 0.0 0.0 0.0
## err_count 0.0 0.0 0.0 0.0 0.0 0.0
## cv_5_valid cv_6_valid cv_7_valid cv_8_valid cv_9_valid cv_10_valid
## accuracy 1.0 1.0 1.0 1.0 1.0 1.0
## auc 1.0 1.0 1.0 1.0 1.0 1.0
## aucpr 0.97959185 0.9787234 0.6875 0.071428575 0.9302326 0.044444446
## err 0.0 0.0 0.0 0.0 0.0 0.0
## err_count 0.0 0.0 0.0 0.0 0.0 0.0
##
## ---
## mean sd cv_1_valid cv_2_valid cv_3_valid
## pr_auc 0.7599853 0.38075897 0.975 0.97619045 0.9811321
## precision 1.0 0.0 1.0 1.0 1.0
## r2 0.99581623 0.013228734 1.0 1.0 0.999996
## recall 1.0 0.0 1.0 1.0 1.0
## rmse 0.010346935 0.032265965 9.8665826E-11 3.373171E-7 9.813164E-4
## specificity 1.0 0.0 1.0 1.0 1.0
## cv_4_valid cv_5_valid cv_6_valid cv_7_valid cv_8_valid
## pr_auc 0.9756098 0.97959185 0.9787234 0.6875 0.071428575
## precision 1.0 1.0 1.0 1.0 1.0
## r2 0.99999994 0.9581666 1.0 1.0 0.9999999
## recall 1.0 1.0 1.0 1.0 1.0
## rmse 1.3877063E-4 0.10217344 6.9318458E-9 6.078576E-12 1.7548395E-4
## specificity 1.0 1.0 1.0 1.0 1.0
## cv_9_valid cv_10_valid
## pr_auc 0.9302326 0.044444446
## precision 1.0 1.0
## r2 1.0 1.0
## recall 1.0 1.0
## rmse 3.0627751E-9 1.2293739E-14
## specificity 1.0 1.0
##
## Scoring History:
## timestamp duration number_of_trees training_rmse
## 1 2020-08-21 17:35:15 4 min 49.555 sec 0 0.49908
## 2 2020-08-21 17:35:15 4 min 49.558 sec 1 0.49253
## 3 2020-08-21 17:35:15 4 min 49.560 sec 2 0.35008
## 4 2020-08-21 17:35:15 4 min 49.562 sec 3 0.25795
## 5 2020-08-21 17:35:15 4 min 49.564 sec 4 0.19000
## training_logloss training_auc training_pr_auc training_lift
## 1 0.69131 0.50000 0.00000 1.00000
## 2 0.67817 0.58774 0.57146 1.11212
## 3 0.42935 0.99870 0.94885 1.88578
## 4 0.29601 0.99852 0.99365 1.88578
## 5 0.20819 0.99971 0.95663 1.88578
## training_classification_error
## 1 0.46971
## 2 0.45829
## 3 0.00114
## 4 0.00114
## 5 0.00114
##
## ---
## timestamp duration number_of_trees training_rmse
## 492 2020-08-21 17:35:19 4 min 53.518 sec 491 0.00000
## 493 2020-08-21 17:35:19 4 min 53.527 sec 492 0.00000
## 494 2020-08-21 17:35:19 4 min 53.535 sec 493 0.00000
## 495 2020-08-21 17:35:19 4 min 53.544 sec 494 0.00000
## 496 2020-08-21 17:35:19 4 min 53.553 sec 495 0.00000
## 497 2020-08-21 17:35:20 4 min 53.726 sec 754 0.00000
## training_logloss training_auc training_pr_auc training_lift
## 492 0.00000 1.00000 0.00647 1.88578
## 493 0.00000 1.00000 0.00862 1.88578
## 494 0.00000 1.00000 0.00431 1.88578
## 495 0.00000 1.00000 0.00862 1.88578
## 496 0.00000 1.00000 0.00431 1.88578
## 497 0.00000 1.00000 0.00647 1.88578
## training_classification_error
## 492 0.00000
## 493 0.00000
## 494 0.00000
## 495 0.00000
## 496 0.00000
## 497 0.00000
##
## Variable Importances: (Extract with `h2o.varimp`)
## =================================================
##
## Variable Importances:
## variable relative_importance scaled_importance percentage
## 1 Today 252.705246 1.000000 0.947556
## 2 X 3.534384 0.013986 0.013253
## 3 Volume 2.454071 0.009711 0.009202
## 4 Year 2.439800 0.009655 0.009148
## 5 Lag1 2.343328 0.009273 0.008787
## 6 Lag2 1.668701 0.006603 0.006257
## 7 Lag4 1.261423 0.004992 0.004730
## 8 Lag5 0.266561 0.001055 0.001000
## 9 Lag3 0.018097 0.000072 0.000068
# pred <- h2o.predict(object=best_model, newdata=test_h2o)
# Predict test-set class labels; h2o returns a frame with the predicted label
# plus one probability column per class level ("Down", "Up").
# NOTE(review): predict() for H2O models takes no n.trees argument -- it is
# presumably ignored here; confirm against the h2o.predict documentation.
pred.boost.test <- predict(best_model, newdata = test_h2o, n.trees = 6000)
pred.boost.test <- as_tibble(pred.boost.test)
# One predicted label per test observation.
stopifnot(identical(length(pred.boost.test$predict),length(test.df$Direction)))
# Confusion matrix: rows = predicted class, columns = observed class.
(confusion_table <- table(pred.boost.test$predict, test.df$Direction))
##
## Down Up
## Down 191 8
## Up 0 176
# Treating "Up" as the positive class:
# TN: observed Down and predicted Down
true.negative <- confusion_table[1,1]
# TP: observed Up and predicted Up
true.positive <- confusion_table[2,2]
# FP: observed Down but predicted Up
false.positive <- confusion_table[2,1]
# FN: observed Up but predicted Down
false.negative <- confusion_table[1,2]
observations.total <- true.negative + false.negative + true.positive + false.positive
observed.as.0 <- true.negative + false.positive
observed.as.1 <- false.negative + true.positive
# Accuracy of always predicting the majority class (the "null" classifier).
(nullClassifier <- max(observed.as.0 / observations.total, observed.as.1 / observations.total))
## [1] 0.5093333
# classification Rate (accuracy):
(classification.rate <- mean(pred.boost.test$predict == test.df$Direction))
## [1] 0.9786667
# Test error Rate:
(missclassificationRate <- 1 - classification.rate)
## [1] 0.02133333
# False Positive Rate (Type I error rate; equals 1 - specificity):
(FP_rates <- false.positive / observed.as.0)
## [1] 0
# True Positive Rate (sensitivity / recall / power; 1 - Type II error rate):
(TP_rates <- true.positive / observed.as.1)
## [1] 0.9565217
# Precision (1 - false discovery proportion): of the 176 "Up" predictions,
# all 176 were actually Up.
(precisions <- true.positive / (true.positive + false.positive))
## [1] 1
# Specificity
(specificities <- 1 - false.positive/(false.positive + true.negative))
## [1] 1
library(pROC)
# ROC computed from the predicted P(Up) column, so the full curve is traced.
roc_obj <- roc(unlist(test.df$Direction), pred.boost.test$Up)
## Setting levels: control = Down, case = Up
## Setting direction: controls < cases
# Let's draw some AUC plots
plot(roc_obj, legacy.axes = TRUE)
library(tidyverse) # for data wrangling
library(doParallel) # for parallel backend to foreach
library(foreach) # for parallel processing with for loops
library(dataPreparation)
# Modeling packages
library(caret) # for general model fitting
library(rpart) # for fitting decision trees
library(ipred) # for fitting bagged decision trees
# Print warnings as they occur instead of holding them until the end.
options(warn = 1)
set.seed(1113)
# Load the Smarket data; "?" marks missing values in the raw file.
smarket.df <- read.csv(
  "/Users/shahrdadshadab/env/my-R-project/ISLR/Data/Smarket.csv",
  header = TRUE, stringsAsFactors = FALSE, na.strings = "?"
)
str(smarket.df)
## 'data.frame': 1250 obs. of 10 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Year : int 2001 2001 2001 2001 2001 2001 2001 2001 2001 2001 ...
## $ Lag1 : num 0.381 0.959 1.032 -0.623 0.614 ...
## $ Lag2 : num -0.192 0.381 0.959 1.032 -0.623 ...
## $ Lag3 : num -2.624 -0.192 0.381 0.959 1.032 ...
## $ Lag4 : num -1.055 -2.624 -0.192 0.381 0.959 ...
## $ Lag5 : num 5.01 -1.055 -2.624 -0.192 0.381 ...
## $ Volume : num 1.19 1.3 1.41 1.28 1.21 ...
## $ Today : num 0.959 1.032 -0.623 0.614 0.213 ...
## $ Direction: chr "Up" "Up" "Down" "Up" ...
# Drop degenerate columns using the dataPreparation helpers.
# Columns holding a single constant value:
constant_cols <- whichAreConstant(smarket.df)
## [1] "whichAreConstant: it took me 0s to identify 0 constant column(s)"
# Exact duplicate columns (for example col1 == col2):
(double_cols <- whichAreInDouble(smarket.df))
## [1] "whichAreInDouble: it took me 0s to identify 0 column(s) to drop."
## integer(0)
# Exact bijections (for example col1 = A, B, B, A and col2 = 1, 2, 2, 1):
(bijections_cols <- whichAreBijection(smarket.df))
## [1] "whichAreBijection: it took me 0.06s to identify 0 column(s) to drop."
## integer(0)
# BUG FIX: use the scalar, short-circuiting && in an if() condition, not the
# vectorized & (since R 4.3, && even errors on length > 1 operands).
smarket.df <- if (!is.null(bijections_cols) && length(bijections_cols) > 0)
smarket.df[- bijections_cols] else smarket.df
# Show any rows that are entirely NA (prints an empty frame when none exist).
smarket.df %>%
filter_all(all_vars(is.na(.)))
# Make sure no literal "NA" string is in the character column Direction.
# (Real NA values would not match == "NA"; this only catches the string.)
(na.count <- smarket.df %>%
filter(Direction == "NA") %>%
nrow)
## [1] 0
# BUG FIX: the original `stopifnot(na.count != numeric(0))` compared a scalar
# against a zero-length vector, producing logical(0), which stopifnot() treats
# as vacuously TRUE -- the assertion could never fail. nrow() always returns a
# length-1 integer, so assert that the count is zero.
stopifnot(na.count == 0)
# Every Direction value must be either "Up" or "Down".
ups <- sum(smarket.df$Direction == "Up")
downs <- sum(smarket.df$Direction == "Down")
stopifnot(identical(ups + downs, nrow(smarket.df)))
# Bagging (unlike R's gbm) accepts a factor response directly.
smarket.df$Direction <- as.factor(smarket.df$Direction)
str(smarket.df$Direction)
## Factor w/ 2 levels "Down","Up": 2 2 1 2 2 2 1 2 2 2 ...
# Factor codes: "Down" -> 1, "Up" -> 2 (levels sort alphabetically).
head(smarket.df$Direction)
## [1] Up Up Down Up Up Up
## Levels: Down Up
head(as.integer(smarket.df$Direction))
## [1] 2 2 1 2 2 2
# 70/30 train/test split.
(n <- nrow(smarket.df))
## [1] 1250
(no.of.train <- round(0.7 * n, 3))
## [1] 875
train.idx <- sample(seq_len(n), no.of.train)
test.idx <- setdiff(seq_len(n), train.idx)
stopifnot(identical(length(train.idx) + length(test.idx), n))
train.df <- smarket.df[train.idx, ]
test.df <- smarket.df[test.idx, ]
# ------------------------------------
# Apply bagging: use the OOB error rate (similar in spirit to CV) to pick the
# number of bootstrap trees, as you see below.
# Collect one row per tree count in a preallocated list and bind once at the
# end, instead of growing the tibble with rbind() inside the loop (O(n^2)).
oob_rows <- vector("list", length(50:100))
for (no.tree in 50:100) {
  train_time <- system.time({
    smarket.model <- bagging(
      formula = Direction ~ .,
      data = train.df,
      nbagg = no.tree, # number of bootstrap replications in the bagged model
      coob = TRUE,     # compute the OOB error estimate
      # Grow deep, unpruned rpart trees that need only two observations in a
      # node to split.
      # BUG FIX: the original also passed `no.of.trees = no.tree, train_time`
      # to rpart.control(); neither is an rpart.control() argument, and the
      # positional train_time landed in `minbucket`. Use cp = 0 for no
      # pruning, consistent with the caret::train() call used elsewhere.
      control = rpart.control(minsplit = 2, cp = 0)
    )
  })
  oob_rows[[no.tree - 49]] <- tibble(
    OOB.err = smarket.model$err,
    no.of.trees = no.tree,
    # BUG FIX: store the scalar elapsed seconds; a raw proc_time object has
    # length 5 and does not fit a one-row tibble column.
    train.time = train_time[["elapsed"]]
  )
}
bagging.plot.data.OOB <- bind_rows(oob_rows)
bagging.plot.data.OOB
# For all the trees the out-of-bag estimate of misclassification error is zero!
# Apply bagging with 10-fold CV to see how well the ensemble generalizes.
cv_ctrl <- trainControl(method = "cv", number = 10)
smarket.model.cv <- train(
  Direction ~ .,
  data = train.df,
  method = "treebag",
  trControl = cv_ctrl,
  nbagg = 200,
  # Deep, unpruned trees: split down to two observations, no complexity penalty.
  control = rpart.control(minsplit = 2, cp = 0)
)
#----------------- Interpretation of Bagging Model -----------------------
# Bagging improves the prediction accuracy for high variance (and low bias)
# models at the expense of interpretability and computational speed. However,
# using various interpretability algorithms such as VIPs and PDPs,
# we can still make inferences about how our bagged model leverages feature information.
# With bagging, since we use many trees built on bootstrapped samples, we are likely
# to see many more features used for splits. Consequently, we tend to have many
# more features involved but with lower levels of importance.
# Plot the five most important predictors of the cross-validated bagged model.
vip::vip(smarket.model.cv, num_features = 5)
# Although the averaging effect of bagging diminishes the ability to interpret the
# final ensemble, PDPs and other interpretability methods help us to
# interpret any “black box” model. The plot below highlights the unique, and sometimes
# non-linear, non-monotonic relationships that may exist between a feature and response.
# Construct partial dependence plots.
# Helper: partial dependence of the bagged model's prediction on one feature,
# rendered with ggplot2's autoplot().
pd_plot <- function(feature) {
  pdp::partial(
    smarket.model.cv,
    pred.var = feature,
    grid.resolution = 20
  ) %>%
    autoplot()
}
p1 <- pd_plot("Today")
p2 <- pd_plot("Lag1")
# Show both panels side by side.
gridExtra::grid.arrange(p1, p2, nrow = 1)
## Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
## Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]`
## instead.
## Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
## Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]`
## instead.
# Pull the final fitted bagging model out of the caret wrapper and predict
# class labels for the held-out observations.
smarket.model <- smarket.model.cv$finalModel
pred.bag.test <- predict(smarket.model, newdata = test.df)
# Sanity check: one prediction per test observation.
stopifnot(identical(length(pred.bag.test), length(test.df$Direction)))
# Confusion matrix: rows = predicted class, columns = observed class.
(confusion_table <- table(pred.bag.test, test.df$Direction))
##
## pred.bag.test Down Up
## Down 190 0
## Up 1 184
# "Up" is treated as the positive class; index cells by their dimnames.
true.negative <- confusion_table["Down", "Down"]  # predicted Down, was Down
true.positive <- confusion_table["Up", "Up"]      # predicted Up, was Up
false.positive <- confusion_table["Up", "Down"]   # predicted Up, was Down
false.negative <- confusion_table["Down", "Up"]   # predicted Down, was Up
# Marginal totals.
observations.total <- sum(confusion_table)
observed.as.0 <- true.negative + false.positive
observed.as.1 <- true.positive + false.negative
# Accuracy of always predicting the majority class (the "null" classifier).
(nullClassifier <- max(observed.as.0, observed.as.1) / observations.total)
## [1] 0.5093333
# Classification rate (accuracy):
(classification.rate <- mean(pred.bag.test == test.df$Direction))
## [1] 0.9973333
# Test error rate:
(missclassificationRate <- 1 - classification.rate)
## [1] 0.002666667
# False positive rate (Type I error rate; equals 1 - specificity):
(FP_rates <- false.positive / observed.as.0)
## [1] 0.005235602
# True positive rate (sensitivity / recall / power; 1 - Type II error rate):
(TP_rates <- true.positive / observed.as.1)
## [1] 1
# Precision (1 - false discovery proportion): of 185 "Up" predictions,
# 184 were actually Up.
(precisions <- true.positive / (true.positive + false.positive))
## [1] 0.9945946
# Specificity:
(specificities <- 1 - false.positive / (false.positive + true.negative))
## [1] 0.9947644
# -------------------- Parallelizing Bagging -----------------------------
# Since bagging is an embarrassingly parallel algorithm, we can fit the trees in parallel here
# Create a parallel socket cluster
# cl <- makeCluster(8) # use 8 workers
# registerDoParallel(cl) # register the parallel backend
#
# # Fit trees in parallel and compute predictions on the test set
# predictions <- foreach(
# icount(160),
# .packages = "rpart",
# .combine = cbind
# ) %dopar% {
# # bootstrap copy of training data
# index <- sample(nrow(train.df), replace = TRUE)
# train_boot <- train.df[index, ]
#
# # fit tree to bootstrap copy
# bagged_tree <- rpart(
# Direction ~ .,
# control = rpart.control(minsplit = 2, cp = 0),
# data = train_boot
# )
#
# predict(bagged_tree, newdata = test.df)
# }
#
# # The result is 375 predictions (each for one row in test.df)
# # performed by 160 trees (2*160 columns: one for "Up" and one for "Down" per tree)
# predictions %>%
# as.data.frame()
#
# # We need to calculate RMSE
# predictions %>%
# as.data.frame %>%
# pmap(~sum(c(...))) %>%
# unlist() %>%
# tibble(Ups = . , Downs = ncol(predictions) - .) %>%
# mutate(
# observation = 1:n(),
# actual = test.df$Direction)
#
# stopCluster(cl)
library(tidyverse) # for data wrangling
library(doParallel) # for parallel backend to foreach
library(dataPreparation)
# Modeling packages
library(ranger) # a c++ implementation of random forest
##
## Attaching package: 'ranger'
## The following object is masked from 'package:randomForest':
##
## importance
library(h2o) # a java-based implementation of random forest
# Print warnings as they occur. (The original issued this call twice in a
# row; the duplicate was a no-op and has been removed.)
options(warn = 1)
set.seed(1113)
# Load the ISLR "Smarket" daily S&P 500 data; "?" marks missing values.
smarket.df <-
  read.csv("/Users/shahrdadshadab/env/my-R-project/ISLR/Data/Smarket.csv",
           header = TRUE, stringsAsFactors = FALSE, na.strings = "?")
str(smarket.df)
## 'data.frame': 1250 obs. of 10 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Year : int 2001 2001 2001 2001 2001 2001 2001 2001 2001 2001 ...
## $ Lag1 : num 0.381 0.959 1.032 -0.623 0.614 ...
## $ Lag2 : num -0.192 0.381 0.959 1.032 -0.623 ...
## $ Lag3 : num -2.624 -0.192 0.381 0.959 1.032 ...
## $ Lag4 : num -1.055 -2.624 -0.192 0.381 0.959 ...
## $ Lag5 : num 5.01 -1.055 -2.624 -0.192 0.381 ...
## $ Volume : num 1.19 1.3 1.41 1.28 1.21 ...
## $ Today : num 0.959 1.032 -0.623 0.614 0.213 ...
## $ Direction: chr "Up" "Up" "Down" "Up" ...
# remove constant variables
constant_cols <- whichAreConstant(smarket.df)
## [1] "whichAreConstant: it took me 0s to identify 0 constant column(s)"
# remove variables that duplicate another column (for example col1 == col2)
(double_cols <- whichAreInDouble(smarket.df))
## [1] "whichAreInDouble: it took me 0s to identify 0 column(s) to drop."
## integer(0)
# remove variables that are exact bijections of another column
# (for example col1 = A, B, B, A and col2 = 1, 2, 2, 1)
(bijections_cols <- whichAreBijection(smarket.df))
## [1] "whichAreBijection: it took me 0.05s to identify 0 column(s) to drop."
## integer(0)
# Drop bijection columns if any were found.
# FIX: the scalar condition now uses short-circuit && instead of the
# vectorized & (the two operands are length-1 here, but && is the correct
# operator for an if() condition and errors loudly on vector input).
smarket.df <- if (!is.null(bijections_cols) && length(bijections_cols) > 0) {
  smarket.df[-bijections_cols]
} else {
  smarket.df
}
# Sanity check: show any rows that are entirely NA (expect none).
smarket.df %>%
  filter_all(all_vars(is.na(.)))
# Make sure no literal "NA" string hides in the character column Direction.
na.count <- smarket.df %>%
  filter(Direction == "NA") %>%
  nrow
# BUG FIX: the original assertion was stopifnot(na.count != numeric(0)).
# Comparing a scalar to a zero-length vector yields logical(0), and
# stopifnot() accepts an empty logical vacuously, so that check could never
# fail. Assert the intended condition instead: zero "NA" strings.
stopifnot(na.count == 0)
# make sure every value is either "Up" or "Down"
ups <- smarket.df %>%
  filter(Direction == "Up") %>%
  nrow
downs <- smarket.df %>%
  filter(Direction == "Down") %>%
  nrow
stopifnot(identical(ups + downs, nrow(smarket.df)))
# Encode the response as a factor; levels sort alphabetically: Down, Up.
smarket.df$Direction <- factor(smarket.df$Direction)
str(smarket.df$Direction)
## Factor w/ 2 levels "Down","Up": 2 2 1 2 2 2 1 2 2 2 ...
# Integer codes follow level order, so "Down" -> 1 and "Up" -> 2
# (the original comment had this backwards; see the transcript below).
head(smarket.df$Direction)
## [1] Up Up Down Up Up Up
## Levels: Down Up
head(as.integer(smarket.df$Direction))
## [1] 2 2 1 2 2 2
# Split into 70% train / 30% test.
n <- nrow(smarket.df)
# FIX: round to a whole number of rows. The original round(0.7 * n, 3) kept
# three decimal places and only produced an integer because 0.7 * 1250
# happens to be integral; for other n it would hand sample() a fractional
# size.
no.of.train <- round(0.7 * n)
train.idx <- sample(seq_len(n), no.of.train)
test.idx <- setdiff(seq_len(n), train.idx)
# every row lands in exactly one of the two partitions
stopifnot(identical(length(train.idx) + length(test.idx), n))
train.df <- smarket.df[train.idx, ]
test.df <- smarket.df[test.idx, ]
# ----------------------------------------------
# --------------- Random forest ----------------
# ranger defaults mtry to floor(sqrt(number of features)); for regression
# problems the usual starting point is floor(number of features / 3) instead.
# respect.unordered.factors = "order" controls how unordered factor
# predictors are handled, and "order" is the recommended setting.
# number of predictor columns (everything except the response)
n_features <- length(setdiff(names(train.df), "Direction"))
# fit a baseline random forest with default settings
smarket_rf1 <- ranger(
  Direction ~ .,
  data = train.df,
  respect.unordered.factors = "order",
  mtry = floor(sqrt(n_features)),
  seed = 123
)
# out-of-bag error, square-rooted to match the RMSE scale used below
(default_rmse <- sqrt(smarket_rf1$prediction.error))
## [1] 0
# The main hyperparameters to consider:
# ----------------------------------------------
# - the number of trees in the forest
# - mtry, the number of candidate features per split (largest impact on
#   predictive accuracy)
# - tree complexity (node size: one for classification, five for regression;
#   max depth; max number of terminal nodes; required node size to allow
#   additional splits)
# - the sampling scheme (by default each bootstrap copy has the same size as
#   the original training data); with unbalanced categories, sampling
#   WITHOUT replacement gives a less biased use of all levels across trees
# - the splitting rule (used primarily for computational efficiency): split
#   points can be randomized so that only a random subset of candidate
#   values is considered per variable; with a single random split value this
#   becomes "extremely randomized trees", which tends to bring no
#   improvement — often a penalty — in predictive accuracy
# Cartesian grid search: assess every combination of hyperparameters of
# interest; rmse starts as NA and is filled in by the loop below.
hyper_grid <- expand.grid(
  mtry = floor(n_features * c(.05, .15, .25, .333, .4)),
  min.node.size = c(1, 3, 5, 10),
  replace = c(TRUE, FALSE),
  sample.fraction = c(.5, .63, .8),
  rmse = NA
)
# execute full cartesian grid search
for (i in seq_len(nrow(hyper_grid))) {
  # fit model for the i-th hyperparameter combination
  fit <- ranger(
    formula = Direction ~ .,
    data = train.df,
    num.trees = n_features * 10,
    mtry = hyper_grid$mtry[i],
    min.node.size = hyper_grid$min.node.size[i],
    replace = hyper_grid$replace[i],
    sample.fraction = hyper_grid$sample.fraction[i],
    verbose = FALSE,
    seed = 123,
    respect.unordered.factors = "order"
  )
  # FIX: removed the trailing comma that followed the last argument above;
  # it created an empty argument that only worked because it happened to
  # match a formal with a default value.
  # record OOB error on the same scale as default_rmse
  hyper_grid$rmse[i] <- sqrt(fit$prediction.error)
}
# assess the top 10 models (perc_gain = OOB improvement over the default
# model, as a percentage of the default model's error)
hyper_grid %>%
  mutate(perc_gain = (default_rmse - rmse) / default_rmse * 100) %>%
  arrange(rmse) %>%
  head(10)
# Sampling only 50% of train.df WITH replacement consistently performs best;
# mtry = 2 and min.node.size = 1 are also good choices.
# ------------------- Fit Random forest with h2o ---------------------
h2o.no_progress()  # suppress progress bars in the transcript
# Start / connect to a local H2O cluster using all available cores.
h2o.init(nthreads = -1, max_mem_size = "10g", ip = "localhost", port = 54321)
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 1 hours 11 minutes
## H2O cluster timezone: America/Toronto
## H2O data parsing timezone: UTC
## H2O cluster version: 3.28.0.2
## H2O cluster version age: 7 months !!!
## H2O cluster name: H2O_started_from_R_shahrdadshadab_mxj391
## H2O cluster total nodes: 1
## H2O cluster total memory: 8.12 GB
## H2O cluster total cores: 8
## H2O cluster allowed cores: 8
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Amazon S3, XGBoost, Algos, AutoML, Core V3, TargetEncoder, Core V4
## R Version: R version 3.6.2 (2019-12-12)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is too old (7 months)!
## Please download and install the latest version from http://h2o.ai/download/
h2o.removeAll()  # drop any objects left over from a previous session
# push the train/test splits into the H2O cluster
h2o.train <- as.h2o(train.df)
h2o.test <- as.h2o(test.df)
# response column name and predictor column names for the h2o API
response <- "Direction"
predictors <- setdiff(colnames(train.df), response)
# Fit a default h2o random forest (10 trees per feature).
h2o.smarket_rf1 <- h2o.randomForest(
  y = response,
  x = predictors,
  training_frame = h2o.train,
  seed = 123,
  ntrees = n_features * 10
)
# print the model summary; OOB metrics are reported on the training frame
h2o.smarket_rf1
## Model Details:
## ==============
##
## H2OBinomialModel: drf
## Model ID: DRF_model_R_1598041551527_127807
## Model Summary:
## number_of_trees number_of_internal_trees model_size_in_bytes min_depth
## 1 90 90 27537 1
## max_depth mean_depth min_leaves max_leaves mean_leaves
## 1 17 7.12222 2 81 19.65556
##
##
## H2OBinomialMetrics: drf
## ** Reported on training data. **
## ** Metrics reported on Out-Of-Bag training samples **
##
## MSE: 0.003180651
## RMSE: 0.05639726
## LogLoss: 0.03548078
## Mean Per-Class Error: 0
## AUC: 1
## AUCPR: 0.887931
## Gini: 1
## R^2: 0.9872305
##
## Confusion Matrix (vertical: actual; across: predicted) for F1-optimal threshold:
## Down Up Error Rate
## Down 411 0 0.000000 =0/411
## Up 0 464 0.000000 =0/464
## Totals 411 464 0.000000 =0/875
##
## Maximum Metrics: Maximum metrics at their respective thresholds
## metric threshold value idx
## 1 max f1 0.659042 1.000000 188
## 2 max f2 0.659042 1.000000 188
## 3 max f0point5 0.659042 1.000000 188
## 4 max accuracy 0.659042 1.000000 188
## 5 max precision 1.000000 1.000000 0
## 6 max recall 0.659042 1.000000 188
## 7 max specificity 1.000000 1.000000 0
## 8 max absolute_mcc 0.659042 1.000000 188
## 9 max min_per_class_accuracy 0.659042 1.000000 188
## 10 max mean_per_class_accuracy 0.659042 1.000000 188
## 11 max tns 1.000000 411.000000 0
## 12 max fns 1.000000 412.000000 0
## 13 max fps 0.000000 411.000000 262
## 14 max tps 0.659042 464.000000 188
## 15 max tnr 1.000000 1.000000 0
## 16 max fnr 1.000000 0.887931 0
## 17 max fpr 0.000000 1.000000 262
## 18 max tpr 0.659042 1.000000 188
##
## Gains/Lift Table: Extract with `h2o.gainsLift(<model>, <data>)` or `h2o.gainsLift(<model>, valid=<T/F>, xval=<T/F>)`
# Let's perform a larger, randomized hyperparameter search with h2o
hyper_grid <- list(
mtries = floor(n_features * c(.12,.15,.25,.333,.4)),
min_rows = c(1,3,5,10),
max_depth = c(10,20,30),
sample_rate = c(.55, .632, .70, .80)
)
# random grid search strategy (h2o requires a list here)
search_criteria <- list(
strategy = "RandomDiscrete",
stopping_metric = "mse",
stopping_tolerance = 0.001, # stop if relative improvement in overall OOB error falls below 0.1% (= 0.001; the old comment's "0.05%" did not match this value)
stopping_rounds = 10, # ...measured over the last 10 models in the search
max_runtime_secs = 60*5 # or stop the search after 5 minutes
)
# launch the random grid search; per-model early stopping uses RMSE over
# the last 10 trees with a 0.5% tolerance
grid <- h2o.grid(
  algorithm = "randomForest",
  grid_id = "rf_random_grid",
  x = predictors,
  y = response,
  training_frame = h2o.train,
  hyper_params = hyper_grid,
  search_criteria = search_criteria,
  ntrees = n_features * 10,
  stopping_metric = "RMSE",
  stopping_rounds = 10,
  stopping_tolerance = 0.005,
  seed = 123
)
# fetch the search results ordered best-first (lowest MSE)
grid_perf <- h2o.getGrid(
  grid_id = "rf_random_grid",
  sort_by = "mse",
  decreasing = FALSE
)
# Best combination: max_depth = 20, min_rows = 1, mtries = 3,
# sample_rate = 0.7, reaching an MSE of ~0.0029 (the table below reports
# MSE, not RMSE).
grid_perf
## H2O Grid Details
## ================
##
## Grid ID: rf_random_grid
## Used hyper parameters:
## - max_depth
## - min_rows
## - mtries
## - sample_rate
## Number of models: 240
## Number of failed models: 0
##
## Hyper-Parameter Search Summary: ordered by increasing mse
## max_depth min_rows mtries sample_rate model_ids
## 1 20 1.0 3 0.7 rf_random_grid_model_53
## 2 30 1.0 3 0.7 rf_random_grid_model_77
## 3 20 1.0 3 0.8 rf_random_grid_model_195
## 4 30 1.0 3 0.8 rf_random_grid_model_215
## 5 10 1.0 3 0.7 rf_random_grid_model_100
## mse
## 1 0.0028904270252804376
## 2 0.0028904270252804376
## 3 0.0028944841220222553
## 4 0.0028944841220222553
## 5 0.0029727719123189143
##
## ---
## max_depth min_rows mtries sample_rate model_ids
## 235 30 10.0 1 0.7 rf_random_grid_model_143
## 236 20 10.0 1 0.7 rf_random_grid_model_206
## 237 30 10.0 1 0.7 rf_random_grid_model_208
## 238 20 10.0 1 0.7 rf_random_grid_model_240
## 239 10 10.0 1 0.7 rf_random_grid_model_136
## 240 10 10.0 1 0.7 rf_random_grid_model_31
## mse
## 235 0.07957467846168671
## 236 0.07957467846168671
## 237 0.07957467846168671
## 238 0.07957467846168671
## 239 0.07997928810857083
## 240 0.07997928810857083
# permutation-based feature importance:
# ----------------------------------------------
# For each tree, the OOB sample is passed down the tree and prediction
# accuracy recorded. Each variable's values are then randomly permuted (one
# at a time) and accuracy recomputed. The drop in accuracy caused by this
# shuffling, averaged over all trees, is the importance score: the larger
# the average drop, the more important the variable.
# Once the grid search has identified good hyperparameter values, refit
# ranger with them; cranking up num.trees gives more stable importance
# estimates.
# Helper: refit the tuned forest computing the requested importance measure.
fit.tuned.rf <- function(importance.measure) {
  ranger(
    formula = Direction ~ .,
    data = train.df,
    num.trees = 2000,
    mtry = 3,
    min.node.size = 1, # i.e. min_rows
    sample.fraction = .80,
    replace = TRUE,
    importance = importance.measure,
    respect.unordered.factors = "order",
    verbose = FALSE,
    seed = 123
  )
}
# refit with impurity-based variable importance
rf_impurity <- fit.tuned.rf("impurity")
# refit with permutation-based variable importance
rf_permutation <- fit.tuned.rf("permutation")
# variable importance plots
vip::vip(rf_impurity)
vip::vip(rf_permutation)
# Today comes out as the most important feature in both plots.
library(tidyverse)
# Small demo: flag rows whose cells are real NA or the literal string "NA"
# (possibly padded with whitespace).
(t <- tibble(x = c("", " NA", "12", "34", " NA", "NA", NA), y = c(NA, "qsfg", "fbf", "12", "NA", "NA ", NA)))
# indices of rows containing at least one such missing marker
(na.idx <-
  t %>%
  pmap_lgl(function(...) {
    cells <- c(...)
    any(is.na(cells) | str_trim(cells) == "NA")
  }) %>%
  which()
)
## [1] 1 2 5 6 7
# the remaining, clean rows
rest.idx <- setdiff(seq_len(nrow(t)), na.idx)
t[na.idx, ]
t[rest.idx, ]
# ------------------------
# library("lme4")
# library("glmmADMB") ## (not on CRAN)
# library("glmmTMB")
# library("MCMCglmm")
# library("blme")
# library("MASS") ## for glmmPQL (base R)
# library("nlme") ## for intervals(), tundra example (base R)
# ## auxiliary
# library("ggplot2") ## for pretty plots generally
# ## ggplot customization:
# theme_set(theme_bw())
# scale_colour_discrete <- function(...,palette="Set1") {
# scale_colour_brewer(...,palette=palette)
# }
# scale_colour_orig <- ggplot2::scale_colour_discrete
# scale_fill_discrete <- function(...,palette="Set1") {
# scale_fill_brewer(...,palette=palette)
# }
# ## to squash facets together ...
# zmargin <- theme(panel.spacing=grid::unit(0,"lines"))
# library("gridExtra") ## for grid.arrange()
# library("broom.mixed")
# ## n.b. as of 25 Sep 2018, need bbolker github version of dotwhisker ...
# library("dotwhisker")
# library("coda") ## MCMC diagnostics
# library("aods3") ## overdispersion diagnostics
# library("plotMCMC") ## pretty plots from MCMC fits
# library("bbmle") ## AICtab
# library("pbkrtest") ## parametric bootstrap
# library("Hmisc")
# ## for general-purpose reshaping and data manipulation:
# library("reshape2")
# library("plyr")
# ## for illustrating effects of observation-level variance in binary data:
# library("numDeriv")
# library("glmmADMB")
# bb <- glmmADMB:::get_bin_loc()[["bin_loc"]]
# bpath <- gsub("glmmadmb$","",bb)
# file.copy(bb,paste0(bpath,"glmmadmb.bak"))
# bburl <- "http://admb-project.org/buildbot/glmmadmb/"
# download.file(paste0(bburl,
# "glmmadmb-mingw64-r2885-windows8-mingw64.exe"), dest=bb)